From 6f79480bba2f910bbc0b52187b98a0f56e10bb4e Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:17:42 -0700 Subject: [PATCH 01/12] making module factory dynamic --- Interfaces/IQcModule.cs | 2 + Modules/AlignmentStatistics.cs | 48 ++++++++++++++++++++++-- Modules/BasicStatistics.cs | 2 + Modules/KmerContent.cs | 2 + Modules/MeanQualityDistribution.cs | 2 + Modules/ModuleFactory.cs | 53 +++++++++++++++++++-------- Modules/NCountsAtPosition.cs | 3 ++ Modules/PerPositionQuality.cs | 3 ++ Modules/PerPositionSequenceContent.cs | 5 +++ Modules/PerSequenceGcContent.cs | 3 ++ Modules/QualityDistributionByBase.cs | 5 +++ Modules/SequenceLengthDistribution.cs | 3 ++ Program.cs | 12 ++++-- 13 files changed, 120 insertions(+), 23 deletions(-) diff --git a/Interfaces/IQcModule.cs b/Interfaces/IQcModule.cs index 293be39..a9392eb 100644 --- a/Interfaces/IQcModule.cs +++ b/Interfaces/IQcModule.cs @@ -7,6 +7,8 @@ public interface IQcModule string Description { get; } + bool IsEnabledForAll { get; } + void ProcessSequence(Sequence sequence); void Reset(); diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index bd69d23..ae9b510 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -1,10 +1,15 @@ +using System.Collections.Generic; +using System.Linq; + namespace Ovation.FasterQC.Net { public class AlignmentStatistics : IQcModule { private ulong sequenceCount; + private ulong baseCount; + private ulong paired; private ulong aligned; @@ -25,16 +30,39 @@ public class AlignmentStatistics : IQcModule private ulong opticalDuplicate; + private ulong alignedBases; + + private readonly Dictionary readLengthHistogram = new(); + public string Name => "alignmentStatistics"; + public bool IsEnabledForAll => true; + public string Description => "Calculates alignment statistics for SAM/BAM files"; public void ProcessSequence(Sequence sequence) { sequenceCount++; - - if ((sequence.ReadFlag & ReadFlag.Paired) != 0) paired++; - if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) aligned++; + baseCount += (ulong)sequence.Read.Length; + + if (readLengthHistogram.ContainsKey(sequence.Read.Length) == false) + { + readLengthHistogram.Add(sequence.Read.Length, new ReadPair()); + } + + var readPair = readLengthHistogram[sequence.Read.Length]; + + if ((sequence.ReadFlag & ReadFlag.Paired) != 0) + { + paired++; + readPair.Paired++; + } + if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) + { + aligned++; + alignedBases += (ulong)sequence.Read.Length; + readPair.AlignedAndPaired++; + } if ((sequence.ReadFlag & ReadFlag.AlignedAndPaired) == ReadFlag.AlignedAndPaired) alignedAndPaired++; if ((sequence.ReadFlag & ReadFlag.SegmentUnmapped) != 0) segmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++; @@ -62,7 +90,19 @@ public void Reset() nextSegmentReverseComplemented, nonPrimaryAlignment, failedQualityChecks, - opticalDuplicate + opticalDuplicate, + alignedBases, + averageReadLength = (double)baseCount / (double)sequenceCount, + histogram = readLengthHistogram + .Select((k, v) => new ulong[] { (ulong)k.Key, k.Value.Paired, k.Value.AlignedAndPaired }) + .OrderBy(a => a[0]) }; + + class ReadPair + { + public ulong AlignedAndPaired { get; set; } + + public ulong Paired { get; set; } + } } } \ No newline at end of file diff --git a/Modules/BasicStatistics.cs b/Modules/BasicStatistics.cs index db24e2e..8761a0c 100644 --- a/Modules/BasicStatistics.cs +++ b/Modules/BasicStatistics.cs @@ -24,6 +24,8 @@ public class BasicStatistics : IQcModule public string Description => "Calculates basic quality statistics"; + public bool IsEnabledForAll => true; + public void ProcessSequence(Sequence sequence) { var sequenceLength = sequence.Read.Length; diff --git a/Modules/KmerContent.cs b/Modules/KmerContent.cs index be701dd..315d004 100644 --- a/Modules/KmerContent.cs +++ b/Modules/KmerContent.cs @@ -22,6 +22,8 @@ public class KMerContent : IQcModule public string Description => "Computes 4-mer counts across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/MeanQualityDistribution.cs b/Modules/MeanQualityDistribution.cs index 25f374e..ee4f9b5 100644 --- a/Modules/MeanQualityDistribution.cs +++ b/Modules/MeanQualityDistribution.cs @@ -20,6 +20,8 @@ public class MeanQualityDistribution : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs index 9e8a1d4..d3eb538 100644 --- a/Modules/ModuleFactory.cs +++ b/Modules/ModuleFactory.cs @@ -1,35 +1,58 @@ +using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using Ovation.FasterQC.Net.Utils; namespace Ovation.FasterQC.Net.Modules { public static class ModuleFactory { - private static readonly Dictionary moduleMap = new() + private static Dictionary? moduleMap; + + private static Dictionary ModuleMap { - ["AlignmentStatistics"] = new AlignmentStatistics(), - ["BasicStatistics"] = new BasicStatistics(), - ["KMerContent"] = new KMerContent(), - ["NCountsAtPosition"] = new NCountsAtPosition(), - ["PerPositionSequenceContent"] = new PerPositionSequenceContent(), - ["PerSequenceGcContent"] = new PerSequenceGcContent(), - ["QualityDistributionByBase"] = new QualityDistributionByBase(), - ["MeanQualityDistribution"] = new MeanQualityDistribution(), - ["SequenceLengthDistribution"] = new SequenceLengthDistribution(), - ["PerPositionQuality"] = new PerPositionQuality(), - }; + get + { + if (moduleMap == null) + { + moduleMap = new Dictionary(); + + var modules = Assembly.GetExecutingAssembly() + .GetTypes() + .Where(t => string.IsNullOrEmpty(t.Namespace) == false && t.GetInterface(nameof(IQcModule)) != null) + .Select(t => Activator.CreateInstance(t)) + .Cast(); + + foreach (var module in modules) + { + moduleMap.Add(module.GetType().Name, module); + } + } + + return moduleMap; + } + } public static IEnumerable Create(CliOptions settings) { if (settings.ModuleNames.Any() == false || settings.ModuleNames.First() == "all") { - settings.ModuleNames = moduleMap.Keys; - return moduleMap.Values; + var moduleNames = new List(); + var modules = new List(); + + foreach (var module in ModuleMap.Where(m => m.Value.IsEnabledForAll == true)) + { + moduleNames.Add(module.Key); + modules.Add(module.Value); + } + + settings.ModuleNames = moduleNames; + return modules; } else { - return settings.ModuleNames.Select(n => moduleMap[n]); + return settings.ModuleNames.Select(n => ModuleMap[n]); } } } diff --git a/Modules/NCountsAtPosition.cs b/Modules/NCountsAtPosition.cs index d458a3a..91ee16c 100644 --- a/Modules/NCountsAtPosition.cs +++ b/Modules/NCountsAtPosition.cs @@ -5,12 +5,15 @@ namespace Ovation.FasterQC.Net public class NCountsAtPosition : IQcModule { private int[] nCounts = Array.Empty(); + private int[] notNCounts = Array.Empty(); public string Name => "nPercentages"; public string Description => "Calculates N counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionQuality.cs b/Modules/PerPositionQuality.cs index 88e0e2c..90e1ef9 100644 --- a/Modules/PerPositionQuality.cs +++ b/Modules/PerPositionQuality.cs @@ -8,12 +8,15 @@ public class PerPositionQuality : IQcModule private QualityMetric[] qualities = Array.Empty(); private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; public string Name => "perPositionQuality"; public string Description => "Calculates the per-position quality metrics"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionSequenceContent.cs b/Modules/PerPositionSequenceContent.cs index c3f2ed0..e4e213d 100644 --- a/Modules/PerPositionSequenceContent.cs +++ b/Modules/PerPositionSequenceContent.cs @@ -6,8 +6,11 @@ namespace Ovation.FasterQC.Net public class PerPositionSequenceContent : IQcModule { private ulong[] aCounts = Array.Empty(); + private ulong[] cCounts = Array.Empty(); + private ulong[] tCounts = Array.Empty(); + private ulong[] gCounts = Array.Empty(); private ulong sequenceCount; @@ -15,6 +18,8 @@ public class PerPositionSequenceContent : IQcModule public string Description => "Calculates ATCG counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerSequenceGcContent.cs b/Modules/PerSequenceGcContent.cs index 1fd9687..9ba1f09 100644 --- a/Modules/PerSequenceGcContent.cs +++ b/Modules/PerSequenceGcContent.cs @@ -6,12 +6,15 @@ namespace Ovation.FasterQC.Net public class PerSequenceGcContent : IQcModule { private ulong[] gcCounts = Array.Empty(); + private ulong sequenceCount; public string Name => "gcDistribution"; public string Description => "Distribution of GC content percentages"; + public bool IsEnabledForAll => true; + public object Data => gcCounts.Select(a => Math.Round((double)a / (double)sequenceCount * 100.0, 3)); public void ProcessSequence(Sequence sequence) diff --git a/Modules/QualityDistributionByBase.cs b/Modules/QualityDistributionByBase.cs index 285ce80..7b26f49 100644 --- a/Modules/QualityDistributionByBase.cs +++ b/Modules/QualityDistributionByBase.cs @@ -8,8 +8,11 @@ public class QualityDistributionByBase : IQcModule private const byte ILLUMINA_BASE_ADJUSTMENT = 33; private readonly ulong[] aQuality = new ulong[128]; + private readonly ulong[] cQuality = new ulong[128]; + private readonly ulong[] tQuality = new ulong[128]; + private readonly ulong[] gQuality = new ulong[128]; private byte lowestScore = byte.MaxValue; @@ -20,6 +23,8 @@ public class QualityDistributionByBase : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/SequenceLengthDistribution.cs b/Modules/SequenceLengthDistribution.cs index 27547f2..9fc5412 100644 --- a/Modules/SequenceLengthDistribution.cs +++ b/Modules/SequenceLengthDistribution.cs @@ -6,6 +6,7 @@ namespace Ovation.FasterQC.Net public class SequenceLengthDistribution : IQcModule { private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; private readonly IDictionary lengths = new Dictionary(); @@ -14,6 +15,8 @@ public class SequenceLengthDistribution : IQcModule public string Description => "Calculates the sequence length distributions"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Program.cs b/Program.cs index 47d28a2..51626be 100644 --- a/Program.cs +++ b/Program.cs @@ -49,7 +49,7 @@ private void Run() On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader)); On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}...")); - while (sequenceReader.ReadSequence(out Sequence? sequence) && sequenceReader.SequencesRead < Settings.ReadLimit) + while (sequenceReader.SequencesRead < Settings.ReadLimit && sequenceReader.ReadSequence(out Sequence? sequence)) { ArgumentNullException.ThrowIfNull(sequence); @@ -70,9 +70,13 @@ private void Run() Dictionary? results = new() { - ["_modules"] = Settings.ModuleNames, - ["_inputFilename"] = Settings.InputFilename, - ["_outputFilename"] = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + ["_metadata"] = new + { + _modules = Settings.ModuleNames, + _inputFilename = Settings.InputFilename, + _outputFilename = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + _sequences = sequenceReader.SequencesRead + } }; foreach (IQcModule? module in modules) From d9562ce2dd8f69c1ea8ba02ac00fd913006283ca Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:35:04 -0700 Subject: [PATCH 02/12] make module list available via --help --- Modules/ModuleFactory.cs | 2 +- Program.cs | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs index d3eb538..dea2b79 100644 --- a/Modules/ModuleFactory.cs +++ b/Modules/ModuleFactory.cs @@ -10,7 +10,7 @@ public static class ModuleFactory { private static Dictionary? moduleMap; - private static Dictionary ModuleMap + public static Dictionary ModuleMap { get { diff --git a/Program.cs b/Program.cs index 51626be..de896a9 100644 --- a/Program.cs +++ b/Program.cs @@ -1,8 +1,10 @@ using System; using System.Collections.Generic; using System.IO; +using System.Text; using System.Text.Json; using CommandLine; +using CommandLine.Text; using Ovation.FasterQC.Net.Modules; using Ovation.FasterQC.Net.Readers; using Ovation.FasterQC.Net.Utils; @@ -31,12 +33,36 @@ private static void Main(string[] args) } ); - _ = parser.ParseArguments(args) - .WithParsed(o => - { - Settings = o; - new Program().Run(); - }); + var parserResult = parser.ParseArguments(args); + + parserResult.WithParsed(o => + { + Settings = o; + new Program().Run(); + }) + .WithNotParsed(errs => DisplayHelp(parserResult, errs)); + } + + static void DisplayHelp(ParserResult result, IEnumerable errs) + { + var moduleList = ModuleFactory.ModuleMap; + var sb = new StringBuilder("List of available modules:").AppendLine(); + + foreach (var module in moduleList) + { + sb.AppendLine($"\t{module.Key} -> {module.Value.Description}"); + } + + var helpText = HelpText.AutoBuild(result, h => + { + h.AdditionalNewLineAfterOption = false; + h.MaximumDisplayWidth = 120; + h.AddPostOptionsText(sb.ToString()); + + return HelpText.DefaultParsingErrorsHandler(result, h); + }, e => e); + + Console.Error.WriteLine(helpText); } private void Run() From 51c11de30e2ed7c588eb2a57e1f83de615fadd8e Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:36:38 -0700 Subject: [PATCH 03/12] make module list available via --help --- Program.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Program.cs b/Program.cs index de896a9..4368556 100644 --- a/Program.cs +++ b/Program.cs @@ -43,10 +43,10 @@ private static void Main(string[] args) .WithNotParsed(errs => DisplayHelp(parserResult, errs)); } - static void DisplayHelp(ParserResult result, IEnumerable errs) + static void DisplayHelp(ParserResult result, IEnumerable _) { var moduleList = ModuleFactory.ModuleMap; - var sb = new StringBuilder("List of available modules:").AppendLine(); + var sb = new StringBuilder("List of available modules for --modules:").AppendLine(); foreach (var module in moduleList) { From 3849d00b131fec2a2a8b9ff98324792352f2dfe4 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:39:18 -0700 Subject: [PATCH 04/12] make module list available via --help --- Program.cs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Program.cs b/Program.cs index 4368556..d1d5f20 100644 --- a/Program.cs +++ b/Program.cs @@ -45,22 +45,23 @@ private static void Main(string[] args) static void DisplayHelp(ParserResult result, IEnumerable _) { - var moduleList = ModuleFactory.ModuleMap; var sb = new StringBuilder("List of available modules for --modules:").AppendLine(); - - foreach (var module in moduleList) + foreach (var module in ModuleFactory.ModuleMap) { sb.AppendLine($"\t{module.Key} -> {module.Value.Description}"); } - var helpText = HelpText.AutoBuild(result, h => - { - h.AdditionalNewLineAfterOption = false; - h.MaximumDisplayWidth = 120; - h.AddPostOptionsText(sb.ToString()); - - return HelpText.DefaultParsingErrorsHandler(result, h); - }, e => e); + var helpText = HelpText.AutoBuild(result, + h => + { + h.AdditionalNewLineAfterOption = false; + h.MaximumDisplayWidth = 120; + h.AddPostOptionsText(sb.ToString()); + + return HelpText.DefaultParsingErrorsHandler(result, h); + }, + e => e + ); Console.Error.WriteLine(helpText); } From 9db0c6409fc9c322c2db2f532057a4f247877e5c Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 19 May 2022 08:08:25 -0700 Subject: [PATCH 05/12] additional flag cleanup --- Models/ReadFlag.cs | 6 ++- Modules/AlignmentStatistics.cs | 98 +++++++++++++++++++++++++++------- 2 files changed, 83 insertions(+), 21 deletions(-) diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs index b9625b0..effa7c3 100644 --- a/Models/ReadFlag.cs +++ b/Models/ReadFlag.cs @@ -47,6 +47,8 @@ public enum ReadFlag : ushort /// LastSegment = 128, + EmbeddedSegment = FirstSegment | LastSegment, + /// /// not primary alignment /// @@ -65,6 +67,8 @@ public enum ReadFlag : ushort /// /// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region) /// - SupplementaryAlignment = 2048 + SupplementaryAlignment = 2048, + + SecondaryAlignment = NotPrimaryAlignment | SupplementaryAlignment } } \ No newline at end of file diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index ae9b510..a080c94 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -24,6 +24,18 @@ public class AlignmentStatistics : IQcModule private ulong nextSegmentReverseComplemented; + private ulong firstSegment; + + private ulong lastSegment; + + private ulong embeddedSegment; + + private ulong unknownIndex; + + private ulong primaryAlignment; + + private ulong secondaryAlignment; + private ulong nonPrimaryAlignment; private ulong failedQualityChecks; @@ -52,6 +64,7 @@ public void ProcessSequence(Sequence sequence) var readPair = readLengthHistogram[sequence.Read.Length]; + // revisit the logic here based on https://samtools.github.io/hts-specs/SAMv1.pdf if ((sequence.ReadFlag & ReadFlag.Paired) != 0) { paired++; @@ -68,9 +81,23 @@ public void ProcessSequence(Sequence sequence) if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.ReverseComplemented) != 0) reverseComplemented++; if ((sequence.ReadFlag & ReadFlag.NextSegmentReverseComplemented) != 0) nextSegmentReverseComplemented++; + + if ((sequence.ReadFlag & ReadFlag.FirstSegment) != 0) firstSegment++; + if ((sequence.ReadFlag & ReadFlag.LastSegment) != 0) lastSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == ReadFlag.EmbeddedSegment) embeddedSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == 0) unknownIndex++; + if ((sequence.ReadFlag & ReadFlag.NotPrimaryAlignment) != 0) nonPrimaryAlignment++; if ((sequence.ReadFlag & ReadFlag.FailedQualityChecks) != 0) failedQualityChecks++; if ((sequence.ReadFlag & ReadFlag.OpticalDuplicate) != 0) opticalDuplicate++; + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == 0) + { + primaryAlignment++; + } + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == ReadFlag.SecondaryAlignment) + { + secondaryAlignment++; + } } public void Reset() @@ -78,31 +105,62 @@ public void Reset() sequenceCount = 0; } - public object Data => new + public object Data { - sequenceCount, - paired, - aligned, - alignedAndPaired, - segmentUnmapped, - nextSegmentUnmapped, - reverseComplemented, - nextSegmentReverseComplemented, - nonPrimaryAlignment, - failedQualityChecks, - opticalDuplicate, - alignedBases, - averageReadLength = (double)baseCount / (double)sequenceCount, - histogram = readLengthHistogram - .Select((k, v) => new ulong[] { (ulong)k.Key, k.Value.Paired, k.Value.AlignedAndPaired }) - .OrderBy(a => a[0]) - }; + get + { + var minReadLength = readLengthHistogram.Keys.Min(); + var maxReadLength = readLengthHistogram.Keys.Max(); + + for (var readLength = minReadLength; readLength < maxReadLength; readLength++) + { + if (readLengthHistogram.ContainsKey(readLength) == false) + { + readLengthHistogram.Add(readLength, new ReadPair()); + } + } + + return new + { + sequenceCount, + paired, + aligned, + alignedAndPaired, + segmentUnmapped, + nextSegmentUnmapped, + reverseComplemented, + nextSegmentReverseComplemented, + firstSegment, + lastSegment, + embeddedSegment, + unknownIndex, + primaryAlignment, + secondaryAlignment, + nonPrimaryAlignment, + failedQualityChecks, + opticalDuplicate, + alignedBases, + averageReadLength = (double)baseCount / (double)sequenceCount, + histogram = new + { + minReadLength, + maxReadLength, + paired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.Paired), + unpaired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.AlignedAndPaired) + } + }; + } + } class ReadPair { - public ulong AlignedAndPaired { get; set; } - public ulong Paired { get; set; } + + public ulong AlignedAndPaired { get; set; } } } } \ No newline at end of file From 0d7b86bd5aa2af2b8e2b46fa452dc0d74d95210e Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 19 May 2022 09:53:40 -0700 Subject: [PATCH 06/12] fixed mqd computations --- .vscode/launch.json | 3 +-- Modules/BasicStatistics.cs | 4 +++- Modules/MeanQualityDistribution.cs | 27 ++++++++++++++------------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 6aa53df..8e867fd 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -23,8 +23,7 @@ "-o", "./tmp/bob.json", "-m", - "BasicStatistics", - "NCountsAtPosition" + "MeanQualityDistribution" ], "cwd": "${workspaceFolder}", // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console diff --git a/Modules/BasicStatistics.cs b/Modules/BasicStatistics.cs index 8761a0c..81f57e4 100644 --- a/Modules/BasicStatistics.cs +++ b/Modules/BasicStatistics.cs @@ -4,6 +4,8 @@ namespace Ovation.FasterQC.Net { public class BasicStatistics : IQcModule { + private const byte ILLUMINA_BASE_ADJUSTMENT = 33; + private ulong sequenceCount; private int minimumReadLength = int.MaxValue; @@ -87,7 +89,7 @@ public void Reset() nCount, xCount, - minimumQuality, + minimumQuality = Math.Min(minimumQuality - ILLUMINA_BASE_ADJUSTMENT, 0), gcContent = Math.Round((double)(cCount + gCount) / (double)totalBases * 100.0, 3), diff --git a/Modules/MeanQualityDistribution.cs b/Modules/MeanQualityDistribution.cs index ee4f9b5..e4a0358 100644 --- a/Modules/MeanQualityDistribution.cs +++ b/Modules/MeanQualityDistribution.cs @@ -12,9 +12,9 @@ public class MeanQualityDistribution : IQcModule // position and fix it when someone asks private readonly ulong[] qualityScores = new ulong[128]; - private byte lowestScore = byte.MaxValue; + private uint lowestMean = uint.MaxValue; - private byte highestScore = byte.MinValue; + private uint highestMean = uint.MinValue; public string Name => "meanQualityDistribution"; @@ -28,9 +28,9 @@ public object Data { return new { - lowestScore = lowestScore - ILLUMINA_BASE_ADJUSTMENT, - highestScore = highestScore - ILLUMINA_BASE_ADJUSTMENT, - distribution = qualityScores.Skip(lowestScore).Take(highestScore - lowestScore + 1) + lowestMean = lowestMean - ILLUMINA_BASE_ADJUSTMENT, + highestMean = highestMean - ILLUMINA_BASE_ADJUSTMENT, + distribution = qualityScores.Skip((int)lowestMean).Take((int)(highestMean - lowestMean + 1)) }; } } @@ -43,16 +43,17 @@ public void ProcessSequence(Sequence sequence) var count = 0; var qual = sequence.Quality; - for (var q = 0; q < sequenceLength; q++) + for (var i = 0; i < sequenceLength; i++) { - lowestScore = Math.Min(lowestScore, qual[q]); - highestScore = Math.Max(highestScore, qual[q]); - - sum += qual[q]; + sum += qual[i]; count++; } - var mean = sum / count; + var mean = (uint)(sum / count); + + lowestMean = Math.Min(lowestMean, mean); + highestMean = Math.Max(highestMean, mean); + qualityScores[mean]++; } @@ -63,8 +64,8 @@ public void Reset() qualityScores[p] = 0; } - lowestScore = byte.MaxValue; - highestScore = byte.MinValue; + lowestMean = uint.MaxValue; + highestMean = uint.MinValue; } } } \ No newline at end of file From 627326f445f275878dc4995bd1e9f8b4c25e8722 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 19 May 2022 12:21:39 -0700 Subject: [PATCH 07/12] adding graph script --- R/fasterqc.R | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 R/fasterqc.R diff --git a/R/fasterqc.R b/R/fasterqc.R new file mode 100644 index 0000000..1de40d1 --- /dev/null +++ b/R/fasterqc.R @@ -0,0 +1,284 @@ +library(tidyverse) +library(jsonlite) +library(tidyjson) +library(ggplot2) +library(hrbrthemes) +library(tidyr) +library(stringr) +library(reshape2) +library(grid) + +# Multiple plot function +# +# ggplot objects can be passed in ..., or to plotlist +# (as a list of ggplot objects) +# - cols: Number of columns in layout +# - layout: A matrix specifying the layout. If present, 'cols' is ignored. +# +# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), +# then plot 1 will go in the upper left, 2 will go in the upper right, and +# 3 will go all the way across the bottom. +# +multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) { + # Make a list from the ... arguments and plotlist + plots <- c(list(...), plotlist) + + num_plots <- length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(num_plots / cols)), + ncol = cols, nrow = ceiling(num_plots / cols) + ) + } + + if (num_plots == 1) { + print(plots[[1]]) + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:num_plots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport( + layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col + )) + } + } +} + +setwd("/Users/michaeljon/src/ovation/fasterqc.net/tmp") +sample <- "in3257_2_S1_sorted" + +# load the data +json <- + jsonlite::fromJSON(readLines(paste(getwd(), "/", sample, ".json", sep = ""))) + +# plot the ACTGs +percent_actg <- data.frame( + as = json$baseCounts$aPercentage, + ts = json$baseCounts$tPercentage, + cs = json$baseCounts$cPercentage, + gs = json$baseCounts$gPercentage +) + +plt <- ggplot(percent_actg, aes(x = seq_along(as))) + + geom_line(aes(y = as, color = "%A")) + + geom_line(aes(y = ts, color = "%T")) + + geom_line(aes(y = cs, color = "%C")) + + geom_line(aes(y = gs, color = "%G")) + + coord_cartesian(ylim = c(0, 100)) + + labs( + color = "Nucleotide", + title = "Percentage of ACTG by sequence position", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "% of ACTG", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".pct-actg.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot the N percentages +n_percentages <- data.frame( + n = json$nPercentages$percentages +) + +plt <- ggplot(n_percentages, aes(x = seq_along(n))) + + geom_line(aes(y = n)) + + coord_cartesian(ylim = c(0, 1)) + + labs( + title = "Percent of N reads", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "% N", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".pct-n.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot GC distribution +gc_distribution <- data.frame( + gc = json$gcDistribution +) + +plt <- ggplot(gc_distribution, aes(x = seq_along(gc))) + + geom_line(aes(y = gc, color = "% GC")) + + # geom_line(aes(y = cumsum(gc), color = "Cumulative")) + + scale_y_continuous(labels = scales::comma) + + labs( + color = "", + title = "GC distribution", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "% GC", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".gc-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot quality distributions +qualities_by_base <- data.frame( + as = json$qualityDistributionByBase$aDistribution, + ts = json$qualityDistributionByBase$tDistribution, + cs = json$qualityDistributionByBase$cDistribution, + gs = json$qualityDistributionByBase$gDistribution +) + +plt <- ggplot(qualities_by_base, aes(x = seq_along(as))) + + geom_line(aes(y = as, color = "%A")) + + geom_line(aes(y = ts, color = "%T")) + + geom_line(aes(y = cs, color = "%C")) + + geom_line(aes(y = gs, color = "%G")) + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + color = "Nucleotide", + title = "Quality distribution by base", + subtitle = paste("Sample ", sample, sep = ""), + x = "Quality (phred)", + y = "# of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".base-qual-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot quality distributions +quality_distribution <- data.frame( + qual = json$meanQualityDistribution$distribution, + low = json$meanQualityDistribution$lowestMean, + high = json$meanQualityDistribution$highestMean +) + +lowest_score <- min(quality_distribution$low) +highest_score <- max(quality_distribution$high) + +plt <- ggplot(quality_distribution, aes(x = lowest_score:highest_score)) + + geom_line(aes(y = qual)) + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + title = "Mean quality distribution", + subtitle = paste("Sample ", sample, sep = ""), + x = "Quality (phred)", + y = "# of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".mean-qual-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +sequence_length_distribution <- data.frame( + len = json$sequenceLengthDistribution$distribution +) + +plt <- ggplot(sequence_length_distribution, aes(x = seq_along(len))) + + geom_line(aes(y = len)) + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + title = "Read count by length", + subtitle = paste("Sample ", sample, sep = ""), + x = "Length of read", + y = "# of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".readlen.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +per_position_qualities <- data.frame( + json$perPositionQual |> select(-distribution) +) + +lowest_score <- min(per_position_qualities$lowestScore) +highest_score <- min(per_position_qualities$highestScore) + +plt <- ggplot(per_position_qualities, aes(x = position)) + + geom_line(aes(y = mean, color = "mean")) + + geom_smooth(aes(y = lowest_score, color = "min"), level = 0.95) + + geom_smooth(aes(y = highest_score, color = "max"), level = 0.95) + + coord_cartesian(ylim = c(lowest_score, highest_score)) + + scale_y_continuous(labels = scales::comma) + + labs( + color = "", + title = "Read quality by position", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "Phred score", + caption = "Species: SARS-CoV-2", + ) + + theme_ipsum() + +png(paste(sample, ".qual.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +histogram <- data.frame( + paired = json$alignmentStatistics$histogram$paired, + unpaired = json$alignmentStatistics$histogram$unpaired +) + +minimum_read_length <- json$alignmentStatistics$histogram$minReadLength +maximum_read_length <- json$alignmentStatistics$histogram$maxReadLength + +histogram$lengths <- minimum_read_length:maximum_read_length + +plt <- ggplot(histogram, aes(x = lengths)) + + geom_col(aes(y = paired), fill = "#ff7f7f") + + scale_y_continuous(labels = scales::comma, trans = "log10") + + coord_cartesian(xlim = c(minimum_read_length, maximum_read_length)) + + labs( + color = "Legend", + title = "Paired segments", + subtitle = "Sample in3257_2_S1", + x = "Length of read", + y = "Number of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".paired.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +plt <- ggplot(histogram, aes(x = lengths)) + + geom_col(aes(y = unpaired), fill = "#7f7fff") + + scale_y_continuous(labels = scales::comma, trans = "log10") + + coord_cartesian(xlim = c(minimum_read_length, maximum_read_length)) + + labs( + color = "Legend", + title = "Aligned and paired segments", + subtitle = "Sample in3257_2_S1", + x = "Length of read", + y = "Number of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +# call multiplot(paired, unpaired) to put two of these on the page + +png(paste(sample, ".aligned.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() \ No newline at end of file From 64d1b39892e36c9dcd03c65172aeeb468ff47aeb Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 19 May 2022 14:35:43 -0700 Subject: [PATCH 08/12] adding simple q distribution calc and chart --- Modules/QualityDistribution.cs | 66 ++++++++++++++++++++++++++++++++++ R/fasterqc.R | 29 ++++++++++++--- 2 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 Modules/QualityDistribution.cs diff --git a/Modules/QualityDistribution.cs b/Modules/QualityDistribution.cs new file mode 100644 index 0000000..666cc47 --- /dev/null +++ b/Modules/QualityDistribution.cs @@ -0,0 +1,66 @@ +using System; +using System.Linq; + +namespace Ovation.FasterQC.Net +{ + public class QualityDistribution : IQcModule + { + private const byte ILLUMINA_BASE_ADJUSTMENT = 33; + + private readonly ulong[] qualities = new ulong[128]; + + private byte lowestScore = byte.MaxValue; + + private byte highestScore = byte.MinValue; + + public string Name => "qualityDistribution"; + + public string Description => "Calculates the distribution of quality scores"; + + public bool IsEnabledForAll => true; + + public object Data + { + get + { + return new + { + lowestScore = lowestScore - ILLUMINA_BASE_ADJUSTMENT, + highestScore = highestScore - ILLUMINA_BASE_ADJUSTMENT, + + lowestScoreUnadjusted = lowestScore, + highestScoreUnadjusted = highestScore, + + qualities = qualities.Skip(lowestScore).Take(highestScore - lowestScore + 1), + }; + } + } + + public void ProcessSequence(Sequence sequence) + { + var sequenceLength = sequence.Read.Length; + var quals = sequence.Quality; + + for (var i = 0; i < sequenceLength; i++) + { + var qual = quals[i]; + + lowestScore = Math.Min(lowestScore, quals[i]); + highestScore = Math.Max(highestScore, quals[i]); + + qualities[qual]++; + } + } + + public void Reset() + { + for (var p = 0; p < qualities.Length; p++) + { + qualities[p] = 0; + } + + lowestScore = byte.MaxValue; + highestScore = byte.MinValue; + } + } +} diff --git a/R/fasterqc.R b/R/fasterqc.R index 1de40d1..acfd357 100644 --- a/R/fasterqc.R +++ b/R/fasterqc.R @@ -31,7 +31,7 @@ multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) { # ncol: Number of columns of plots # nrow: Number of rows needed, calculated from # of cols layout <- matrix(seq(1, cols * ceiling(num_plots / cols)), - ncol = cols, nrow = ceiling(num_plots / cols) + ncol = cols, nrow = ceiling(num_plots / cols) ) } @@ -56,9 +56,8 @@ multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) { } setwd("/Users/michaeljon/src/ovation/fasterqc.net/tmp") -sample <- "in3257_2_S1_sorted" +sample <- "in3257_2_S1.sorted" -# load the data json <- jsonlite::fromJSON(readLines(paste(getwd(), "/", sample, ".json", sep = ""))) @@ -162,6 +161,28 @@ png(paste(sample, ".base-qual-dist.png", sep = ""), width = 800, height = 600) print(plt) dev.off() +# plot quality distributions +quality_dist <- data.frame( + qs = json$qualityDistribution$qualities +) + +plt <- ggplot(quality_dist, aes(x = seq_along(qs))) + + geom_col(aes(y = qs), fill = "#7faf7f") + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + color = "", + title = "Quality distribution", + subtitle = paste("Sample ", sample, sep = ""), + x = "Quality (phred)", + y = "# of bases (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".qual-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + # plot quality distributions quality_distribution <- data.frame( qual = json$meanQualityDistribution$distribution, @@ -281,4 +302,4 @@ plt <- ggplot(histogram, aes(x = lengths)) + png(paste(sample, ".aligned.png", sep = ""), width = 800, height = 600) print(plt) -dev.off() \ No newline at end of file +dev.off() From eb1a733378500c4bb26c051ffabdc995a7163611 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 19 May 2022 17:10:02 -0700 Subject: [PATCH 09/12] initial bam optional element parser --- .vscode/launch.json | 9 ++-- Models/BamAlignment.cs | 93 ++++++++++++++++++++++++++++++++++++++++++ Readers/BamReader.cs | 6 +++ 3 files changed, 103 insertions(+), 5 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 8e867fd..2b1df33 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,18 +12,17 @@ // If you have changed target frameworks, make sure to update the program path. "program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll", "args": [ - "-v", "-d", "-l", - "1000000", + "10", "-f", - "sam", + "bam", "-i", - "./tmp/in3257_2_S1.sorted.sam", + "./tmp/in3257_2_S1.sorted.bam", "-o", "./tmp/bob.json", "-m", - "MeanQualityDistribution" + "BasicStatistics" ], "cwd": "${workspaceFolder}", // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console diff --git a/Models/BamAlignment.cs b/Models/BamAlignment.cs index 28cdc0f..f87b7cb 100644 --- a/Models/BamAlignment.cs +++ b/Models/BamAlignment.cs @@ -1,4 +1,7 @@ +using System; +using System.ComponentModel; using System.Diagnostics.CodeAnalysis; +using System.Text; namespace Ovation.FasterQC.Net { @@ -37,4 +40,94 @@ public class BamAlignment public byte[] qual { get; set; } = null!; } + + [SuppressMessage("Code style", "IDE1006", Justification = "Names correspond to BAM structure field names")] + public class BamOptionalElement + { + public char[] tag { get; set; } + + public char val_type { get; set; } + + public object value { get; set; } = null!; + + public BamOptionalElement(byte[] block, ref int offset) + { + tag = Encoding.ASCII.GetChars(block, offset, 2); offset += 2; + val_type = Encoding.ASCII.GetChars(block, offset, 1)[0]; offset += 1; + + // consume the rest + switch (val_type) + { + case 'A': offset += 1; break; + + // byte + case 'c': offset += 1; break; + case 'C': offset += 1; break; + + // short + case 's': offset += 2; break; + case 'S': offset += 2; break; + + // int + case 'i': offset += 4; break; + case 'I': offset += 4; break; + + // float + case 'f': offset += 4; break; + + // null-terminated string + case 'Z': + while (block[offset++] != 0) ; + break; + + // null-terminated hex digit pairs + case 'H': + while (block[offset++] != 0) ; + break; + + // array of stuff + case 'B': + var subtype = Encoding.ASCII.GetChars(block, offset, 1)[0]; offset += 1; + var length = BitConverter.ToUInt32(new Span(block, offset, 4)); offset += 4; + + // consume the stuff + for (var element = 0; element < length; element++) + { + switch (subtype) + { + // byte + case 'c': offset += 1; break; + case 'C': offset += 1; break; + + // short + case 's': offset += 2; break; + case 'S': offset += 2; break; + + // int + case 'i': offset += 4; break; + case 'I': offset += 4; break; + + // float + case 'f': offset += 4; break; + } + } + + break; + } + } + + public override string ToString() + { + var sb = new StringBuilder(); + + sb.Append("tag: "); + sb.Append(tag[0]); + sb.Append(tag[1]); + + sb.Append(", type: "); + sb.Append(val_type); + + return sb.ToString(); + } + } } diff --git a/Readers/BamReader.cs b/Readers/BamReader.cs index fd548c6..6463cea 100644 --- a/Readers/BamReader.cs +++ b/Readers/BamReader.cs @@ -164,6 +164,12 @@ private BamAlignment ReadSequence() { bamAlignment.qual = Array.Empty(); } + offset += (int)bamAlignment.l_seq; + + while (offset < block_size) + { + _ = new BamOptionalElement(block, ref offset); + } return bamAlignment; } From 6bd79d0fb4ec20589a70d6aed4f222cef0195209 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Fri, 20 May 2022 11:20:41 -0700 Subject: [PATCH 10/12] fixed percent reporting when limiting reads --- Utils/TimedSequenceProgressBar.cs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/Utils/TimedSequenceProgressBar.cs b/Utils/TimedSequenceProgressBar.cs index 71e8290..51247fc 100644 --- a/Utils/TimedSequenceProgressBar.cs +++ b/Utils/TimedSequenceProgressBar.cs @@ -25,19 +25,27 @@ public TimedSequenceProgressBar(ISequenceReader sequenceReader) : base(100, "Pro public void Update(bool force = false) { - var read = sequenceReader.SequencesRead; - var percent = sequenceReader.ApproximateCompletion; + var sequencesRead = sequenceReader.SequencesRead; + var approximateCompletion = sequenceReader.ApproximateCompletion; - if (force || read % CliOptions.UpdatePeriod == 0) + if (force || sequencesRead % CliOptions.UpdatePeriod == 0) { - if (percent > 0) + // if we're limiting the number of reads then the reader's + // approximation will be incorrect (it's based on file positions), + // so we'll do the math ourselves + if (CliOptions.Settings.ReadLimit < ulong.MaxValue) { - var remainingTime = elapsedTime.ElapsedMilliseconds / percent * 100.0; + approximateCompletion = 100.0 * sequencesRead / CliOptions.Settings.ReadLimit; + } + + if (approximateCompletion > 0) + { + var remainingTime = elapsedTime.ElapsedMilliseconds / approximateCompletion * 100.0; EstimatedDuration = TimeSpan.FromMilliseconds(remainingTime); } - AsProgress().Report(percent / 100.0); - Message = $"{read.WithSsiUnits()} sequences completed"; + AsProgress().Report(approximateCompletion / 100.0); + Message = $"{sequencesRead.WithSsiUnits()} sequences completed"; } } } From b3118747eafb1a71ed0a508e9ef127db4252c8a1 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Fri, 20 May 2022 11:21:07 -0700 Subject: [PATCH 11/12] adding a polyx module --- Modules/PolyX.cs | 105 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 Modules/PolyX.cs diff --git a/Modules/PolyX.cs b/Modules/PolyX.cs new file mode 100644 index 0000000..e382f51 --- /dev/null +++ b/Modules/PolyX.cs @@ -0,0 +1,105 @@ +namespace Ovation.FasterQC.Net +{ + public class PolyX : IQcModule + { + private const byte POLY_SIZE = 10; + + private readonly ulong[] polyA = new ulong[2]; + + private readonly ulong[] polyT = new ulong[2]; + + private readonly ulong[] polyC = new ulong[2]; + + private readonly ulong[] polyG = new ulong[2]; + + private ulong sequenceCount; + + private ulong tooShort; + + public string Name => "polyX"; + + public string Description => "Calculates the the number of reads with polyX heads / tails"; + + public bool IsEnabledForAll => true; + + public object Data + { + get + { + return new + { + sequenceCount, + tooShort, + target = POLY_SIZE, + head = new { a = polyA[0], t = polyT[0], c = polyC[0], g = polyG[0] }, + tail = new { a = polyA[1], t = polyT[1], c = polyC[1], g = polyG[1] }, + }; + } + } + + public void ProcessSequence(Sequence sequence) + { + sequenceCount++; + + if (sequence.Read.Length < POLY_SIZE) + { + tooShort++; + return; + } + + if (CheckPolyHead((byte)'A', sequence.Read) == true) polyA[0]++; + if (CheckPolyTail((byte)'A', sequence.Read) == true) polyA[1]++; + + if (CheckPolyHead((byte)'T', sequence.Read) == true) polyT[0]++; + if (CheckPolyTail((byte)'T', sequence.Read) == true) polyT[1]++; + + if (CheckPolyHead((byte)'C', sequence.Read) == true) polyC[0]++; + if (CheckPolyTail((byte)'C', sequence.Read) == true) polyC[1]++; + + if (CheckPolyHead((byte)'G', sequence.Read) == true) polyG[0]++; + if (CheckPolyTail((byte)'G', sequence.Read) == true) polyG[1]++; + } + + public void Reset() + { + polyA[0] = 0; polyA[1] = 0; + polyT[0] = 0; polyT[1] = 0; + polyC[0] = 0; polyC[1] = 0; + polyG[0] = 0; polyG[1] = 0; + + sequenceCount = 0; + tooShort = 0; + } + + private static bool CheckPolyHead(byte test, byte[] sequence) + { + // consider a loop unroll here + for (var i = 0; i < POLY_SIZE; i++) + { + if (sequence[i] != test) + { + return false; + } + } + + return true; + } + + private static bool CheckPolyTail(byte test, byte[] sequence) + { + var endOfRead = sequence.Length - 1; + var endOfPoly = sequence.Length - (POLY_SIZE + 1); + + // consider a loop unroll here + for (var i = endOfRead; i > endOfPoly; i--) + { + if (sequence[i] != test) + { + return false; + } + } + + return true; + } + } +} From 462e3e3a1e0ea7ffd9e763f77ac43612b0d13756 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Tue, 24 May 2022 12:27:51 -0700 Subject: [PATCH 12/12] adding n reads to polyx --- Modules/PolyX.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Modules/PolyX.cs b/Modules/PolyX.cs index e382f51..1635f41 100644 --- a/Modules/PolyX.cs +++ b/Modules/PolyX.cs @@ -12,6 +12,8 @@ public class PolyX : IQcModule private readonly ulong[] polyG = new ulong[2]; + private readonly ulong[] polyN = new ulong[2]; + private ulong sequenceCount; private ulong tooShort; @@ -31,8 +33,8 @@ public object Data sequenceCount, tooShort, target = POLY_SIZE, - head = new { a = polyA[0], t = polyT[0], c = polyC[0], g = polyG[0] }, - tail = new { a = polyA[1], t = polyT[1], c = polyC[1], g = polyG[1] }, + head = new { a = polyA[0], t = polyT[0], c = polyC[0], g = polyG[0], n = polyN[0] }, + tail = new { a = polyA[1], t = polyT[1], c = polyC[1], g = polyG[1], n = polyN[1] }, }; } } @@ -58,6 +60,9 @@ public void ProcessSequence(Sequence sequence) if (CheckPolyHead((byte)'G', sequence.Read) == true) polyG[0]++; if (CheckPolyTail((byte)'G', sequence.Read) == true) polyG[1]++; + + if (CheckPolyHead((byte)'N', sequence.Read) == true) polyN[0]++; + if (CheckPolyTail((byte)'N', sequence.Read) == true) polyN[1]++; } public void Reset() @@ -66,6 +71,7 @@ public void Reset() polyT[0] = 0; polyT[1] = 0; polyC[0] = 0; polyC[1] = 0; polyG[0] = 0; polyG[1] = 0; + polyN[0] = 0; polyN[1] = 0; sequenceCount = 0; tooShort = 0;