From 6f79480bba2f910bbc0b52187b98a0f56e10bb4e Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 15:17:42 -0700 Subject: [PATCH] making module factory dynamic --- Interfaces/IQcModule.cs | 2 + Modules/AlignmentStatistics.cs | 48 ++++++++++++++++++++++-- Modules/BasicStatistics.cs | 2 + Modules/KmerContent.cs | 2 + Modules/MeanQualityDistribution.cs | 2 + Modules/ModuleFactory.cs | 53 +++++++++++++++++++-------- Modules/NCountsAtPosition.cs | 3 ++ Modules/PerPositionQuality.cs | 3 ++ Modules/PerPositionSequenceContent.cs | 5 +++ Modules/PerSequenceGcContent.cs | 3 ++ Modules/QualityDistributionByBase.cs | 5 +++ Modules/SequenceLengthDistribution.cs | 3 ++ Program.cs | 12 ++++-- 13 files changed, 120 insertions(+), 23 deletions(-) diff --git a/Interfaces/IQcModule.cs b/Interfaces/IQcModule.cs index 293be39..a9392eb 100644 --- a/Interfaces/IQcModule.cs +++ b/Interfaces/IQcModule.cs @@ -7,6 +7,8 @@ public interface IQcModule string Description { get; } + bool IsEnabledForAll { get; } + void ProcessSequence(Sequence sequence); void Reset(); diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index bd69d23..ae9b510 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -1,10 +1,15 @@ +using System.Collections.Generic; +using System.Linq; + namespace Ovation.FasterQC.Net { public class AlignmentStatistics : IQcModule { private ulong sequenceCount; + private ulong baseCount; + private ulong paired; private ulong aligned; @@ -25,16 +30,39 @@ public class AlignmentStatistics : IQcModule private ulong opticalDuplicate; + private ulong alignedBases; + + private readonly Dictionary readLengthHistogram = new(); + public string Name => "alignmentStatistics"; + public bool IsEnabledForAll => true; + public string Description => "Calculates alignment statistics for SAM/BAM files"; public void ProcessSequence(Sequence sequence) { sequenceCount++; - - if ((sequence.ReadFlag & ReadFlag.Paired) != 0) paired++; - if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) aligned++; + baseCount += (ulong)sequence.Read.Length; + + if (readLengthHistogram.ContainsKey(sequence.Read.Length) == false) + { + readLengthHistogram.Add(sequence.Read.Length, new ReadPair()); + } + + var readPair = readLengthHistogram[sequence.Read.Length]; + + if ((sequence.ReadFlag & ReadFlag.Paired) != 0) + { + paired++; + readPair.Paired++; + } + if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) + { + aligned++; + alignedBases += (ulong)sequence.Read.Length; + readPair.AlignedAndPaired++; + } if ((sequence.ReadFlag & ReadFlag.AlignedAndPaired) == ReadFlag.AlignedAndPaired) alignedAndPaired++; if ((sequence.ReadFlag & ReadFlag.SegmentUnmapped) != 0) segmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++; @@ -62,7 +90,19 @@ public void Reset() nextSegmentReverseComplemented, nonPrimaryAlignment, failedQualityChecks, - opticalDuplicate + opticalDuplicate, + alignedBases, + averageReadLength = (double)baseCount / (double)sequenceCount, + histogram = readLengthHistogram + .Select((k, v) => new ulong[] { (ulong)k.Key, k.Value.Paired, k.Value.AlignedAndPaired }) + .OrderBy(a => a[0]) }; + + class ReadPair + { + public ulong AlignedAndPaired { get; set; } + + public ulong Paired { get; set; } + } } } \ No newline at end of file diff --git a/Modules/BasicStatistics.cs b/Modules/BasicStatistics.cs index db24e2e..8761a0c 100644 --- a/Modules/BasicStatistics.cs +++ b/Modules/BasicStatistics.cs @@ -24,6 +24,8 @@ public class BasicStatistics : IQcModule public string Description => "Calculates basic quality statistics"; + public bool IsEnabledForAll => true; + public void ProcessSequence(Sequence sequence) { var sequenceLength = sequence.Read.Length; diff --git a/Modules/KmerContent.cs b/Modules/KmerContent.cs index be701dd..315d004 100644 --- a/Modules/KmerContent.cs +++ b/Modules/KmerContent.cs @@ -22,6 +22,8 @@ public class KMerContent : IQcModule public string Description => "Computes 4-mer counts across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/MeanQualityDistribution.cs b/Modules/MeanQualityDistribution.cs index 25f374e..ee4f9b5 100644 --- a/Modules/MeanQualityDistribution.cs +++ b/Modules/MeanQualityDistribution.cs @@ -20,6 +20,8 @@ public class MeanQualityDistribution : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs index 9e8a1d4..d3eb538 100644 --- a/Modules/ModuleFactory.cs +++ b/Modules/ModuleFactory.cs @@ -1,35 +1,58 @@ +using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using Ovation.FasterQC.Net.Utils; namespace Ovation.FasterQC.Net.Modules { public static class ModuleFactory { - private static readonly Dictionary moduleMap = new() + private static Dictionary? moduleMap; + + private static Dictionary ModuleMap { - ["AlignmentStatistics"] = new AlignmentStatistics(), - ["BasicStatistics"] = new BasicStatistics(), - ["KMerContent"] = new KMerContent(), - ["NCountsAtPosition"] = new NCountsAtPosition(), - ["PerPositionSequenceContent"] = new PerPositionSequenceContent(), - ["PerSequenceGcContent"] = new PerSequenceGcContent(), - ["QualityDistributionByBase"] = new QualityDistributionByBase(), - ["MeanQualityDistribution"] = new MeanQualityDistribution(), - ["SequenceLengthDistribution"] = new SequenceLengthDistribution(), - ["PerPositionQuality"] = new PerPositionQuality(), - }; + get + { + if (moduleMap == null) + { + moduleMap = new Dictionary(); + + var modules = Assembly.GetExecutingAssembly() + .GetTypes() + .Where(t => string.IsNullOrEmpty(t.Namespace) == false && t.GetInterface(nameof(IQcModule)) != null) + .Select(t => Activator.CreateInstance(t)) + .Cast(); + + foreach (var module in modules) + { + moduleMap.Add(module.GetType().Name, module); + } + } + + return moduleMap; + } + } public static IEnumerable Create(CliOptions settings) { if (settings.ModuleNames.Any() == false || settings.ModuleNames.First() == "all") { - settings.ModuleNames = moduleMap.Keys; - return moduleMap.Values; + var moduleNames = new List(); + var modules = new List(); + + foreach (var module in ModuleMap.Where(m => m.Value.IsEnabledForAll == true)) + { + moduleNames.Add(module.Key); + modules.Add(module.Value); + } + + settings.ModuleNames = moduleNames; + return modules; } else { - return settings.ModuleNames.Select(n => moduleMap[n]); + return settings.ModuleNames.Select(n => ModuleMap[n]); } } } diff --git a/Modules/NCountsAtPosition.cs b/Modules/NCountsAtPosition.cs index d458a3a..91ee16c 100644 --- a/Modules/NCountsAtPosition.cs +++ b/Modules/NCountsAtPosition.cs @@ -5,12 +5,15 @@ namespace Ovation.FasterQC.Net public class NCountsAtPosition : IQcModule { private int[] nCounts = Array.Empty(); + private int[] notNCounts = Array.Empty(); public string Name => "nPercentages"; public string Description => "Calculates N counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionQuality.cs b/Modules/PerPositionQuality.cs index 88e0e2c..90e1ef9 100644 --- a/Modules/PerPositionQuality.cs +++ b/Modules/PerPositionQuality.cs @@ -8,12 +8,15 @@ public class PerPositionQuality : IQcModule private QualityMetric[] qualities = Array.Empty(); private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; public string Name => "perPositionQuality"; public string Description => "Calculates the per-position quality metrics"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionSequenceContent.cs b/Modules/PerPositionSequenceContent.cs index c3f2ed0..e4e213d 100644 --- a/Modules/PerPositionSequenceContent.cs +++ b/Modules/PerPositionSequenceContent.cs @@ -6,8 +6,11 @@ namespace Ovation.FasterQC.Net public class PerPositionSequenceContent : IQcModule { private ulong[] aCounts = Array.Empty(); + private ulong[] cCounts = Array.Empty(); + private ulong[] tCounts = Array.Empty(); + private ulong[] gCounts = Array.Empty(); private ulong sequenceCount; @@ -15,6 +18,8 @@ public class PerPositionSequenceContent : IQcModule public string Description => "Calculates ATCG counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerSequenceGcContent.cs b/Modules/PerSequenceGcContent.cs index 1fd9687..9ba1f09 100644 --- a/Modules/PerSequenceGcContent.cs +++ b/Modules/PerSequenceGcContent.cs @@ -6,12 +6,15 @@ namespace Ovation.FasterQC.Net public class PerSequenceGcContent : IQcModule { private ulong[] gcCounts = Array.Empty(); + private ulong sequenceCount; public string Name => "gcDistribution"; public string Description => "Distribution of GC content percentages"; + public bool IsEnabledForAll => true; + public object Data => gcCounts.Select(a => Math.Round((double)a / (double)sequenceCount * 100.0, 3)); public void ProcessSequence(Sequence sequence) diff --git a/Modules/QualityDistributionByBase.cs b/Modules/QualityDistributionByBase.cs index 285ce80..7b26f49 100644 --- a/Modules/QualityDistributionByBase.cs +++ b/Modules/QualityDistributionByBase.cs @@ -8,8 +8,11 @@ public class QualityDistributionByBase : IQcModule private const byte ILLUMINA_BASE_ADJUSTMENT = 33; private readonly ulong[] aQuality = new ulong[128]; + private readonly ulong[] cQuality = new ulong[128]; + private readonly ulong[] tQuality = new ulong[128]; + private readonly ulong[] gQuality = new ulong[128]; private byte lowestScore = byte.MaxValue; @@ -20,6 +23,8 @@ public class QualityDistributionByBase : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/SequenceLengthDistribution.cs b/Modules/SequenceLengthDistribution.cs index 27547f2..9fc5412 100644 --- a/Modules/SequenceLengthDistribution.cs +++ b/Modules/SequenceLengthDistribution.cs @@ -6,6 +6,7 @@ namespace Ovation.FasterQC.Net public class SequenceLengthDistribution : IQcModule { private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; private readonly IDictionary lengths = new Dictionary(); @@ -14,6 +15,8 @@ public class SequenceLengthDistribution : IQcModule public string Description => "Calculates the sequence length distributions"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Program.cs b/Program.cs index 47d28a2..51626be 100644 --- a/Program.cs +++ b/Program.cs @@ -49,7 +49,7 @@ private void Run() On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader)); On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}...")); - while (sequenceReader.ReadSequence(out Sequence? sequence) && sequenceReader.SequencesRead < Settings.ReadLimit) + while (sequenceReader.SequencesRead < Settings.ReadLimit && sequenceReader.ReadSequence(out Sequence? sequence)) { ArgumentNullException.ThrowIfNull(sequence); @@ -70,9 +70,13 @@ private void Run() Dictionary? results = new() { - ["_modules"] = Settings.ModuleNames, - ["_inputFilename"] = Settings.InputFilename, - ["_outputFilename"] = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + ["_metadata"] = new + { + _modules = Settings.ModuleNames, + _inputFilename = Settings.InputFilename, + _outputFilename = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + _sequences = sequenceReader.SequencesRead + } }; foreach (IQcModule? module in modules)