From 6f79480bba2f910bbc0b52187b98a0f56e10bb4e Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:17:42 -0700 Subject: [PATCH 1/5] making module factory dynamic --- Interfaces/IQcModule.cs | 2 + Modules/AlignmentStatistics.cs | 48 ++++++++++++++++++++++-- Modules/BasicStatistics.cs | 2 + Modules/KmerContent.cs | 2 + Modules/MeanQualityDistribution.cs | 2 + Modules/ModuleFactory.cs | 53 +++++++++++++++++++-------- Modules/NCountsAtPosition.cs | 3 ++ Modules/PerPositionQuality.cs | 3 ++ Modules/PerPositionSequenceContent.cs | 5 +++ Modules/PerSequenceGcContent.cs | 3 ++ Modules/QualityDistributionByBase.cs | 5 +++ Modules/SequenceLengthDistribution.cs | 3 ++ Program.cs | 12 ++++-- 13 files changed, 120 insertions(+), 23 deletions(-) diff --git a/Interfaces/IQcModule.cs b/Interfaces/IQcModule.cs index 293be39..a9392eb 100644 --- a/Interfaces/IQcModule.cs +++ b/Interfaces/IQcModule.cs @@ -7,6 +7,8 @@ public interface IQcModule string Description { get; } + bool IsEnabledForAll { get; } + void ProcessSequence(Sequence sequence); void Reset(); diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index bd69d23..ae9b510 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -1,10 +1,15 @@ +using System.Collections.Generic; +using System.Linq; + namespace Ovation.FasterQC.Net { public class AlignmentStatistics : IQcModule { private ulong sequenceCount; + private ulong baseCount; + private ulong paired; private ulong aligned; @@ -25,16 +30,39 @@ public class AlignmentStatistics : IQcModule private ulong opticalDuplicate; + private ulong alignedBases; + + private readonly Dictionary readLengthHistogram = new(); + public string Name => "alignmentStatistics"; + public bool IsEnabledForAll => true; + public string Description => "Calculates alignment statistics for SAM/BAM files"; public void ProcessSequence(Sequence sequence) { sequenceCount++; - - if ((sequence.ReadFlag & ReadFlag.Paired) != 0) paired++; - if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) aligned++; + baseCount += (ulong)sequence.Read.Length; + + if (readLengthHistogram.ContainsKey(sequence.Read.Length) == false) + { + readLengthHistogram.Add(sequence.Read.Length, new ReadPair()); + } + + var readPair = readLengthHistogram[sequence.Read.Length]; + + if ((sequence.ReadFlag & ReadFlag.Paired) != 0) + { + paired++; + readPair.Paired++; + } + if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) + { + aligned++; + alignedBases += (ulong)sequence.Read.Length; + readPair.AlignedAndPaired++; + } if ((sequence.ReadFlag & ReadFlag.AlignedAndPaired) == ReadFlag.AlignedAndPaired) alignedAndPaired++; if ((sequence.ReadFlag & ReadFlag.SegmentUnmapped) != 0) segmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++; @@ -62,7 +90,19 @@ public void Reset() nextSegmentReverseComplemented, nonPrimaryAlignment, failedQualityChecks, - opticalDuplicate + opticalDuplicate, + alignedBases, + averageReadLength = (double)baseCount / (double)sequenceCount, + histogram = readLengthHistogram + .Select((k, v) => new ulong[] { (ulong)k.Key, k.Value.Paired, k.Value.AlignedAndPaired }) + .OrderBy(a => a[0]) }; + + class ReadPair + { + public ulong AlignedAndPaired { get; set; } + + public ulong Paired { get; set; } + } } } \ No newline at end of file diff --git a/Modules/BasicStatistics.cs b/Modules/BasicStatistics.cs index db24e2e..8761a0c 100644 --- a/Modules/BasicStatistics.cs +++ b/Modules/BasicStatistics.cs @@ -24,6 +24,8 @@ public class BasicStatistics : IQcModule public string Description => "Calculates basic quality statistics"; + public bool IsEnabledForAll => true; + public void ProcessSequence(Sequence sequence) { var sequenceLength = sequence.Read.Length; diff --git a/Modules/KmerContent.cs b/Modules/KmerContent.cs index be701dd..315d004 100644 --- a/Modules/KmerContent.cs +++ b/Modules/KmerContent.cs @@ -22,6 +22,8 @@ public class KMerContent : IQcModule public string Description => "Computes 4-mer counts across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/MeanQualityDistribution.cs b/Modules/MeanQualityDistribution.cs index 25f374e..ee4f9b5 100644 --- a/Modules/MeanQualityDistribution.cs +++ b/Modules/MeanQualityDistribution.cs @@ -20,6 +20,8 @@ public class MeanQualityDistribution : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs index 9e8a1d4..d3eb538 100644 --- a/Modules/ModuleFactory.cs +++ b/Modules/ModuleFactory.cs @@ -1,35 +1,58 @@ +using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using Ovation.FasterQC.Net.Utils; namespace Ovation.FasterQC.Net.Modules { public static class ModuleFactory { - private static readonly Dictionary moduleMap = new() + private static Dictionary? moduleMap; + + private static Dictionary ModuleMap { - ["AlignmentStatistics"] = new AlignmentStatistics(), - ["BasicStatistics"] = new BasicStatistics(), - ["KMerContent"] = new KMerContent(), - ["NCountsAtPosition"] = new NCountsAtPosition(), - ["PerPositionSequenceContent"] = new PerPositionSequenceContent(), - ["PerSequenceGcContent"] = new PerSequenceGcContent(), - ["QualityDistributionByBase"] = new QualityDistributionByBase(), - ["MeanQualityDistribution"] = new MeanQualityDistribution(), - ["SequenceLengthDistribution"] = new SequenceLengthDistribution(), - ["PerPositionQuality"] = new PerPositionQuality(), - }; + get + { + if (moduleMap == null) + { + moduleMap = new Dictionary(); + + var modules = Assembly.GetExecutingAssembly() + .GetTypes() + .Where(t => string.IsNullOrEmpty(t.Namespace) == false && t.GetInterface(nameof(IQcModule)) != null) + .Select(t => Activator.CreateInstance(t)) + .Cast(); + + foreach (var module in modules) + { + moduleMap.Add(module.GetType().Name, module); + } + } + + return moduleMap; + } + } public static IEnumerable Create(CliOptions settings) { if (settings.ModuleNames.Any() == false || settings.ModuleNames.First() == "all") { - settings.ModuleNames = moduleMap.Keys; - return moduleMap.Values; + var moduleNames = new List(); + var modules = new List(); + + foreach (var module in ModuleMap.Where(m => m.Value.IsEnabledForAll == true)) + { + moduleNames.Add(module.Key); + modules.Add(module.Value); + } + + settings.ModuleNames = moduleNames; + return modules; } else { - return settings.ModuleNames.Select(n => moduleMap[n]); + return settings.ModuleNames.Select(n => ModuleMap[n]); } } } diff --git a/Modules/NCountsAtPosition.cs b/Modules/NCountsAtPosition.cs index d458a3a..91ee16c 100644 --- a/Modules/NCountsAtPosition.cs +++ b/Modules/NCountsAtPosition.cs @@ -5,12 +5,15 @@ namespace Ovation.FasterQC.Net public class NCountsAtPosition : IQcModule { private int[] nCounts = Array.Empty(); + private int[] notNCounts = Array.Empty(); public string Name => "nPercentages"; public string Description => "Calculates N counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionQuality.cs b/Modules/PerPositionQuality.cs index 88e0e2c..90e1ef9 100644 --- a/Modules/PerPositionQuality.cs +++ b/Modules/PerPositionQuality.cs @@ -8,12 +8,15 @@ public class PerPositionQuality : IQcModule private QualityMetric[] qualities = Array.Empty(); private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; public string Name => "perPositionQuality"; public string Description => "Calculates the per-position quality metrics"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionSequenceContent.cs b/Modules/PerPositionSequenceContent.cs index c3f2ed0..e4e213d 100644 --- a/Modules/PerPositionSequenceContent.cs +++ b/Modules/PerPositionSequenceContent.cs @@ -6,8 +6,11 @@ namespace Ovation.FasterQC.Net public class PerPositionSequenceContent : IQcModule { private ulong[] aCounts = Array.Empty(); + private ulong[] cCounts = Array.Empty(); + private ulong[] tCounts = Array.Empty(); + private ulong[] gCounts = Array.Empty(); private ulong sequenceCount; @@ -15,6 +18,8 @@ public class PerPositionSequenceContent : IQcModule public string Description => "Calculates ATCG counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerSequenceGcContent.cs b/Modules/PerSequenceGcContent.cs index 1fd9687..9ba1f09 100644 --- a/Modules/PerSequenceGcContent.cs +++ b/Modules/PerSequenceGcContent.cs @@ -6,12 +6,15 @@ namespace Ovation.FasterQC.Net public class PerSequenceGcContent : IQcModule { private ulong[] gcCounts = Array.Empty(); + private ulong sequenceCount; public string Name => "gcDistribution"; public string Description => "Distribution of GC content percentages"; + public bool IsEnabledForAll => true; + public object Data => gcCounts.Select(a => Math.Round((double)a / (double)sequenceCount * 100.0, 3)); public void ProcessSequence(Sequence sequence) diff --git a/Modules/QualityDistributionByBase.cs b/Modules/QualityDistributionByBase.cs index 285ce80..7b26f49 100644 --- a/Modules/QualityDistributionByBase.cs +++ b/Modules/QualityDistributionByBase.cs @@ -8,8 +8,11 @@ public class QualityDistributionByBase : IQcModule private const byte ILLUMINA_BASE_ADJUSTMENT = 33; private readonly ulong[] aQuality = new ulong[128]; + private readonly ulong[] cQuality = new ulong[128]; + private readonly ulong[] tQuality = new ulong[128]; + private readonly ulong[] gQuality = new ulong[128]; private byte lowestScore = byte.MaxValue; @@ -20,6 +23,8 @@ public class QualityDistributionByBase : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/SequenceLengthDistribution.cs b/Modules/SequenceLengthDistribution.cs index 27547f2..9fc5412 100644 --- a/Modules/SequenceLengthDistribution.cs +++ b/Modules/SequenceLengthDistribution.cs @@ -6,6 +6,7 @@ namespace Ovation.FasterQC.Net public class SequenceLengthDistribution : IQcModule { private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; private readonly IDictionary lengths = new Dictionary(); @@ -14,6 +15,8 @@ public class SequenceLengthDistribution : IQcModule public string Description => "Calculates the sequence length distributions"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Program.cs b/Program.cs index 47d28a2..51626be 100644 --- a/Program.cs +++ b/Program.cs @@ -49,7 +49,7 @@ private void Run() On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader)); On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}...")); - while (sequenceReader.ReadSequence(out Sequence? sequence) && sequenceReader.SequencesRead < Settings.ReadLimit) + while (sequenceReader.SequencesRead < Settings.ReadLimit && sequenceReader.ReadSequence(out Sequence? sequence)) { ArgumentNullException.ThrowIfNull(sequence); @@ -70,9 +70,13 @@ private void Run() Dictionary? results = new() { - ["_modules"] = Settings.ModuleNames, - ["_inputFilename"] = Settings.InputFilename, - ["_outputFilename"] = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + ["_metadata"] = new + { + _modules = Settings.ModuleNames, + _inputFilename = Settings.InputFilename, + _outputFilename = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + _sequences = sequenceReader.SequencesRead + } }; foreach (IQcModule? module in modules) From d9562ce2dd8f69c1ea8ba02ac00fd913006283ca Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:35:04 -0700 Subject: [PATCH 2/5] make module list available via --help --- Modules/ModuleFactory.cs | 2 +- Program.cs | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs index d3eb538..dea2b79 100644 --- a/Modules/ModuleFactory.cs +++ b/Modules/ModuleFactory.cs @@ -10,7 +10,7 @@ public static class ModuleFactory { private static Dictionary? moduleMap; - private static Dictionary ModuleMap + public static Dictionary ModuleMap { get { diff --git a/Program.cs b/Program.cs index 51626be..de896a9 100644 --- a/Program.cs +++ b/Program.cs @@ -1,8 +1,10 @@ using System; using System.Collections.Generic; using System.IO; +using System.Text; using System.Text.Json; using CommandLine; +using CommandLine.Text; using Ovation.FasterQC.Net.Modules; using Ovation.FasterQC.Net.Readers; using Ovation.FasterQC.Net.Utils; @@ -31,12 +33,36 @@ private static void Main(string[] args) } ); - _ = parser.ParseArguments(args) - .WithParsed(o => - { - Settings = o; - new Program().Run(); - }); + var parserResult = parser.ParseArguments(args); + + parserResult.WithParsed(o => + { + Settings = o; + new Program().Run(); + }) + .WithNotParsed(errs => DisplayHelp(parserResult, errs)); + } + + static void DisplayHelp(ParserResult result, IEnumerable errs) + { + var moduleList = ModuleFactory.ModuleMap; + var sb = new StringBuilder("List of available modules:").AppendLine(); + + foreach (var module in moduleList) + { + sb.AppendLine($"\t{module.Key} -> {module.Value.Description}"); + } + + var helpText = HelpText.AutoBuild(result, h => + { + h.AdditionalNewLineAfterOption = false; + h.MaximumDisplayWidth = 120; + h.AddPostOptionsText(sb.ToString()); + + return HelpText.DefaultParsingErrorsHandler(result, h); + }, e => e); + + Console.Error.WriteLine(helpText); } private void Run() From 51c11de30e2ed7c588eb2a57e1f83de615fadd8e Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:36:38 -0700 Subject: [PATCH 3/5] make module list available via --help --- Program.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Program.cs b/Program.cs index de896a9..4368556 100644 --- a/Program.cs +++ b/Program.cs @@ -43,10 +43,10 @@ private static void Main(string[] args) .WithNotParsed(errs => DisplayHelp(parserResult, errs)); } - static void DisplayHelp(ParserResult result, IEnumerable errs) + static void DisplayHelp(ParserResult result, IEnumerable _) { var moduleList = ModuleFactory.ModuleMap; - var sb = new StringBuilder("List of available modules:").AppendLine(); + var sb = new StringBuilder("List of available modules for --modules:").AppendLine(); foreach (var module in moduleList) { From 3849d00b131fec2a2a8b9ff98324792352f2dfe4 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:39:18 -0700 Subject: [PATCH 4/5] make module list available via --help --- Program.cs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Program.cs b/Program.cs index 4368556..d1d5f20 100644 --- a/Program.cs +++ b/Program.cs @@ -45,22 +45,23 @@ private static void Main(string[] args) static void DisplayHelp(ParserResult result, IEnumerable _) { - var moduleList = ModuleFactory.ModuleMap; var sb = new StringBuilder("List of available modules for --modules:").AppendLine(); - - foreach (var module in moduleList) + foreach (var module in ModuleFactory.ModuleMap) { sb.AppendLine($"\t{module.Key} -> {module.Value.Description}"); } - var helpText = HelpText.AutoBuild(result, h => - { - h.AdditionalNewLineAfterOption = false; - h.MaximumDisplayWidth = 120; - h.AddPostOptionsText(sb.ToString()); - - return HelpText.DefaultParsingErrorsHandler(result, h); - }, e => e); + var helpText = HelpText.AutoBuild(result, + h => + { + h.AdditionalNewLineAfterOption = false; + h.MaximumDisplayWidth = 120; + h.AddPostOptionsText(sb.ToString()); + + return HelpText.DefaultParsingErrorsHandler(result, h); + }, + e => e + ); Console.Error.WriteLine(helpText); } From 9db0c6409fc9c322c2db2f532057a4f247877e5c Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 19 May 2022 08:08:25 -0700 Subject: [PATCH 5/5] additional flag cleanup --- Models/ReadFlag.cs | 6 ++- Modules/AlignmentStatistics.cs | 98 +++++++++++++++++++++++++++------- 2 files changed, 83 insertions(+), 21 deletions(-) diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs index b9625b0..effa7c3 100644 --- a/Models/ReadFlag.cs +++ b/Models/ReadFlag.cs @@ -47,6 +47,8 @@ public enum ReadFlag : ushort /// LastSegment = 128, + EmbeddedSegment = FirstSegment | LastSegment, + /// /// not primary alignment /// @@ -65,6 +67,8 @@ public enum ReadFlag : ushort /// /// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region) /// - SupplementaryAlignment = 2048 + SupplementaryAlignment = 2048, + + SecondaryAlignment = NotPrimaryAlignment | SupplementaryAlignment } } \ No newline at end of file diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index ae9b510..a080c94 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -24,6 +24,18 @@ public class AlignmentStatistics : IQcModule private ulong nextSegmentReverseComplemented; + private ulong firstSegment; + + private ulong lastSegment; + + private ulong embeddedSegment; + + private ulong unknownIndex; + + private ulong primaryAlignment; + + private ulong secondaryAlignment; + private ulong nonPrimaryAlignment; private ulong failedQualityChecks; @@ -52,6 +64,7 @@ public void ProcessSequence(Sequence sequence) var readPair = readLengthHistogram[sequence.Read.Length]; + // revisit the logic here based on https://samtools.github.io/hts-specs/SAMv1.pdf if ((sequence.ReadFlag & ReadFlag.Paired) != 0) { paired++; @@ -68,9 +81,23 @@ public void ProcessSequence(Sequence sequence) if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.ReverseComplemented) != 0) reverseComplemented++; if ((sequence.ReadFlag & ReadFlag.NextSegmentReverseComplemented) != 0) nextSegmentReverseComplemented++; + + if ((sequence.ReadFlag & ReadFlag.FirstSegment) != 0) firstSegment++; + if ((sequence.ReadFlag & ReadFlag.LastSegment) != 0) lastSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == ReadFlag.EmbeddedSegment) embeddedSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == 0) unknownIndex++; + if ((sequence.ReadFlag & ReadFlag.NotPrimaryAlignment) != 0) nonPrimaryAlignment++; if ((sequence.ReadFlag & ReadFlag.FailedQualityChecks) != 0) failedQualityChecks++; if ((sequence.ReadFlag & ReadFlag.OpticalDuplicate) != 0) opticalDuplicate++; + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == 0) + { + primaryAlignment++; + } + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == ReadFlag.SecondaryAlignment) + { + secondaryAlignment++; + } } public void Reset() @@ -78,31 +105,62 @@ public void Reset() sequenceCount = 0; } - public object Data => new + public object Data { - sequenceCount, - paired, - aligned, - alignedAndPaired, - segmentUnmapped, - nextSegmentUnmapped, - reverseComplemented, - nextSegmentReverseComplemented, - nonPrimaryAlignment, - failedQualityChecks, - opticalDuplicate, - alignedBases, - averageReadLength = (double)baseCount / (double)sequenceCount, - histogram = readLengthHistogram - .Select((k, v) => new ulong[] { (ulong)k.Key, k.Value.Paired, k.Value.AlignedAndPaired }) - .OrderBy(a => a[0]) - }; + get + { + var minReadLength = readLengthHistogram.Keys.Min(); + var maxReadLength = readLengthHistogram.Keys.Max(); + + for (var readLength = minReadLength; readLength < maxReadLength; readLength++) + { + if (readLengthHistogram.ContainsKey(readLength) == false) + { + readLengthHistogram.Add(readLength, new ReadPair()); + } + } + + return new + { + sequenceCount, + paired, + aligned, + alignedAndPaired, + segmentUnmapped, + nextSegmentUnmapped, + reverseComplemented, + nextSegmentReverseComplemented, + firstSegment, + lastSegment, + embeddedSegment, + unknownIndex, + primaryAlignment, + secondaryAlignment, + nonPrimaryAlignment, + failedQualityChecks, + opticalDuplicate, + alignedBases, + averageReadLength = (double)baseCount / (double)sequenceCount, + histogram = new + { + minReadLength, + maxReadLength, + paired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.Paired), + unpaired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.AlignedAndPaired) + } + }; + } + } class ReadPair { - public ulong AlignedAndPaired { get; set; } - public ulong Paired { get; set; } + + public ulong AlignedAndPaired { get; set; } } } } \ No newline at end of file