diff --git a/Interfaces/IQcModule.cs b/Interfaces/IQcModule.cs index 293be39..a9392eb 100644 --- a/Interfaces/IQcModule.cs +++ b/Interfaces/IQcModule.cs @@ -7,6 +7,8 @@ public interface IQcModule string Description { get; } + bool IsEnabledForAll { get; } + void ProcessSequence(Sequence sequence); void Reset(); diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs index b9625b0..effa7c3 100644 --- a/Models/ReadFlag.cs +++ b/Models/ReadFlag.cs @@ -47,6 +47,8 @@ public enum ReadFlag : ushort /// LastSegment = 128, + EmbeddedSegment = FirstSegment | LastSegment, + /// /// not primary alignment /// @@ -65,6 +67,8 @@ public enum ReadFlag : ushort /// /// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region) /// - SupplementaryAlignment = 2048 + SupplementaryAlignment = 2048, + + SecondaryAlignment = NotPrimaryAlignment | SupplementaryAlignment } } \ No newline at end of file diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index bd69d23..a080c94 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -1,10 +1,15 @@ +using System.Collections.Generic; +using System.Linq; + namespace Ovation.FasterQC.Net { public class AlignmentStatistics : IQcModule { private ulong sequenceCount; + private ulong baseCount; + private ulong paired; private ulong aligned; @@ -19,30 +24,80 @@ public class AlignmentStatistics : IQcModule private ulong nextSegmentReverseComplemented; + private ulong firstSegment; + + private ulong lastSegment; + + private ulong embeddedSegment; + + private ulong unknownIndex; + + private ulong primaryAlignment; + + private ulong secondaryAlignment; + private ulong nonPrimaryAlignment; private ulong failedQualityChecks; private ulong opticalDuplicate; + private ulong alignedBases; + + private readonly Dictionary readLengthHistogram = new(); + public string Name => "alignmentStatistics"; + public bool IsEnabledForAll => true; + public string Description => "Calculates alignment statistics for SAM/BAM files"; public void ProcessSequence(Sequence sequence) { sequenceCount++; - - if ((sequence.ReadFlag & ReadFlag.Paired) != 0) paired++; - if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) aligned++; + baseCount += (ulong)sequence.Read.Length; + + if (readLengthHistogram.ContainsKey(sequence.Read.Length) == false) + { + readLengthHistogram.Add(sequence.Read.Length, new ReadPair()); + } + + var readPair = readLengthHistogram[sequence.Read.Length]; + + // revisit the logic here based on https://samtools.github.io/hts-specs/SAMv1.pdf + if ((sequence.ReadFlag & ReadFlag.Paired) != 0) + { + paired++; + readPair.Paired++; + } + if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) + { + aligned++; + alignedBases += (ulong)sequence.Read.Length; + readPair.AlignedAndPaired++; + } if ((sequence.ReadFlag & ReadFlag.AlignedAndPaired) == ReadFlag.AlignedAndPaired) alignedAndPaired++; if ((sequence.ReadFlag & ReadFlag.SegmentUnmapped) != 0) segmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.ReverseComplemented) != 0) reverseComplemented++; if ((sequence.ReadFlag & ReadFlag.NextSegmentReverseComplemented) != 0) nextSegmentReverseComplemented++; + + if ((sequence.ReadFlag & ReadFlag.FirstSegment) != 0) firstSegment++; + if ((sequence.ReadFlag & ReadFlag.LastSegment) != 0) lastSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == ReadFlag.EmbeddedSegment) embeddedSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == 0) unknownIndex++; + if ((sequence.ReadFlag & ReadFlag.NotPrimaryAlignment) != 0) nonPrimaryAlignment++; if ((sequence.ReadFlag & ReadFlag.FailedQualityChecks) != 0) failedQualityChecks++; if ((sequence.ReadFlag & ReadFlag.OpticalDuplicate) != 0) opticalDuplicate++; + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == 0) + { + primaryAlignment++; + } + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == ReadFlag.SecondaryAlignment) + { + secondaryAlignment++; + } } public void Reset() @@ -50,19 +105,62 @@ public void Reset() sequenceCount = 0; } - public object Data => new + public object Data { - sequenceCount, - paired, - aligned, - alignedAndPaired, - segmentUnmapped, - nextSegmentUnmapped, - reverseComplemented, - nextSegmentReverseComplemented, - nonPrimaryAlignment, - failedQualityChecks, - opticalDuplicate - }; + get + { + var minReadLength = readLengthHistogram.Keys.Min(); + var maxReadLength = readLengthHistogram.Keys.Max(); + + for (var readLength = minReadLength; readLength < maxReadLength; readLength++) + { + if (readLengthHistogram.ContainsKey(readLength) == false) + { + readLengthHistogram.Add(readLength, new ReadPair()); + } + } + + return new + { + sequenceCount, + paired, + aligned, + alignedAndPaired, + segmentUnmapped, + nextSegmentUnmapped, + reverseComplemented, + nextSegmentReverseComplemented, + firstSegment, + lastSegment, + embeddedSegment, + unknownIndex, + primaryAlignment, + secondaryAlignment, + nonPrimaryAlignment, + failedQualityChecks, + opticalDuplicate, + alignedBases, + averageReadLength = (double)baseCount / (double)sequenceCount, + histogram = new + { + minReadLength, + maxReadLength, + paired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.Paired), + unpaired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.AlignedAndPaired) + } + }; + } + } + + class ReadPair + { + public ulong Paired { get; set; } + + public ulong AlignedAndPaired { get; set; } + } } } \ No newline at end of file diff --git a/Modules/BasicStatistics.cs b/Modules/BasicStatistics.cs index db24e2e..8761a0c 100644 --- a/Modules/BasicStatistics.cs +++ b/Modules/BasicStatistics.cs @@ -24,6 +24,8 @@ public class BasicStatistics : IQcModule public string Description => "Calculates basic quality statistics"; + public bool IsEnabledForAll => true; + public void ProcessSequence(Sequence sequence) { var sequenceLength = sequence.Read.Length; diff --git a/Modules/KmerContent.cs b/Modules/KmerContent.cs index be701dd..315d004 100644 --- a/Modules/KmerContent.cs +++ b/Modules/KmerContent.cs @@ -22,6 +22,8 @@ public class KMerContent : IQcModule public string Description => "Computes 4-mer counts across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/MeanQualityDistribution.cs b/Modules/MeanQualityDistribution.cs index 25f374e..ee4f9b5 100644 --- a/Modules/MeanQualityDistribution.cs +++ b/Modules/MeanQualityDistribution.cs @@ -20,6 +20,8 @@ public class MeanQualityDistribution : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs index 9e8a1d4..dea2b79 100644 --- a/Modules/ModuleFactory.cs +++ b/Modules/ModuleFactory.cs @@ -1,35 +1,58 @@ +using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using Ovation.FasterQC.Net.Utils; namespace Ovation.FasterQC.Net.Modules { public static class ModuleFactory { - private static readonly Dictionary moduleMap = new() + private static Dictionary? moduleMap; + + public static Dictionary ModuleMap { - ["AlignmentStatistics"] = new AlignmentStatistics(), - ["BasicStatistics"] = new BasicStatistics(), - ["KMerContent"] = new KMerContent(), - ["NCountsAtPosition"] = new NCountsAtPosition(), - ["PerPositionSequenceContent"] = new PerPositionSequenceContent(), - ["PerSequenceGcContent"] = new PerSequenceGcContent(), - ["QualityDistributionByBase"] = new QualityDistributionByBase(), - ["MeanQualityDistribution"] = new MeanQualityDistribution(), - ["SequenceLengthDistribution"] = new SequenceLengthDistribution(), - ["PerPositionQuality"] = new PerPositionQuality(), - }; + get + { + if (moduleMap == null) + { + moduleMap = new Dictionary(); + + var modules = Assembly.GetExecutingAssembly() + .GetTypes() + .Where(t => string.IsNullOrEmpty(t.Namespace) == false && t.GetInterface(nameof(IQcModule)) != null) + .Select(t => Activator.CreateInstance(t)) + .Cast(); + + foreach (var module in modules) + { + moduleMap.Add(module.GetType().Name, module); + } + } + + return moduleMap; + } + } public static IEnumerable Create(CliOptions settings) { if (settings.ModuleNames.Any() == false || settings.ModuleNames.First() == "all") { - settings.ModuleNames = moduleMap.Keys; - return moduleMap.Values; + var moduleNames = new List(); + var modules = new List(); + + foreach (var module in ModuleMap.Where(m => m.Value.IsEnabledForAll == true)) + { + moduleNames.Add(module.Key); + modules.Add(module.Value); + } + + settings.ModuleNames = moduleNames; + return modules; } else { - return settings.ModuleNames.Select(n => moduleMap[n]); + return settings.ModuleNames.Select(n => ModuleMap[n]); } } } diff --git a/Modules/NCountsAtPosition.cs b/Modules/NCountsAtPosition.cs index d458a3a..91ee16c 100644 --- a/Modules/NCountsAtPosition.cs +++ b/Modules/NCountsAtPosition.cs @@ -5,12 +5,15 @@ namespace Ovation.FasterQC.Net public class NCountsAtPosition : IQcModule { private int[] nCounts = Array.Empty(); + private int[] notNCounts = Array.Empty(); public string Name => "nPercentages"; public string Description => "Calculates N counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionQuality.cs b/Modules/PerPositionQuality.cs index 88e0e2c..90e1ef9 100644 --- a/Modules/PerPositionQuality.cs +++ b/Modules/PerPositionQuality.cs @@ -8,12 +8,15 @@ public class PerPositionQuality : IQcModule private QualityMetric[] qualities = Array.Empty(); private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; public string Name => "perPositionQuality"; public string Description => "Calculates the per-position quality metrics"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionSequenceContent.cs b/Modules/PerPositionSequenceContent.cs index c3f2ed0..e4e213d 100644 --- a/Modules/PerPositionSequenceContent.cs +++ b/Modules/PerPositionSequenceContent.cs @@ -6,8 +6,11 @@ namespace Ovation.FasterQC.Net public class PerPositionSequenceContent : IQcModule { private ulong[] aCounts = Array.Empty(); + private ulong[] cCounts = Array.Empty(); + private ulong[] tCounts = Array.Empty(); + private ulong[] gCounts = Array.Empty(); private ulong sequenceCount; @@ -15,6 +18,8 @@ public class PerPositionSequenceContent : IQcModule public string Description => "Calculates ATCG counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerSequenceGcContent.cs b/Modules/PerSequenceGcContent.cs index 1fd9687..9ba1f09 100644 --- a/Modules/PerSequenceGcContent.cs +++ b/Modules/PerSequenceGcContent.cs @@ -6,12 +6,15 @@ namespace Ovation.FasterQC.Net public class PerSequenceGcContent : IQcModule { private ulong[] gcCounts = Array.Empty(); + private ulong sequenceCount; public string Name => "gcDistribution"; public string Description => "Distribution of GC content percentages"; + public bool IsEnabledForAll => true; + public object Data => gcCounts.Select(a => Math.Round((double)a / (double)sequenceCount * 100.0, 3)); public void ProcessSequence(Sequence sequence) diff --git a/Modules/QualityDistributionByBase.cs b/Modules/QualityDistributionByBase.cs index 285ce80..7b26f49 100644 --- a/Modules/QualityDistributionByBase.cs +++ b/Modules/QualityDistributionByBase.cs @@ -8,8 +8,11 @@ public class QualityDistributionByBase : IQcModule private const byte ILLUMINA_BASE_ADJUSTMENT = 33; private readonly ulong[] aQuality = new ulong[128]; + private readonly ulong[] cQuality = new ulong[128]; + private readonly ulong[] tQuality = new ulong[128]; + private readonly ulong[] gQuality = new ulong[128]; private byte lowestScore = byte.MaxValue; @@ -20,6 +23,8 @@ public class QualityDistributionByBase : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/SequenceLengthDistribution.cs b/Modules/SequenceLengthDistribution.cs index 27547f2..9fc5412 100644 --- a/Modules/SequenceLengthDistribution.cs +++ b/Modules/SequenceLengthDistribution.cs @@ -6,6 +6,7 @@ namespace Ovation.FasterQC.Net public class SequenceLengthDistribution : IQcModule { private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; private readonly IDictionary lengths = new Dictionary(); @@ -14,6 +15,8 @@ public class SequenceLengthDistribution : IQcModule public string Description => "Calculates the sequence length distributions"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Program.cs b/Program.cs index 47d28a2..d1d5f20 100644 --- a/Program.cs +++ b/Program.cs @@ -1,8 +1,10 @@ using System; using System.Collections.Generic; using System.IO; +using System.Text; using System.Text.Json; using CommandLine; +using CommandLine.Text; using Ovation.FasterQC.Net.Modules; using Ovation.FasterQC.Net.Readers; using Ovation.FasterQC.Net.Utils; @@ -31,12 +33,37 @@ private static void Main(string[] args) } ); - _ = parser.ParseArguments(args) - .WithParsed(o => + var parserResult = parser.ParseArguments(args); + + parserResult.WithParsed(o => + { + Settings = o; + new Program().Run(); + }) + .WithNotParsed(errs => DisplayHelp(parserResult, errs)); + } + + static void DisplayHelp(ParserResult result, IEnumerable _) + { + var sb = new StringBuilder("List of available modules for --modules:").AppendLine(); + foreach (var module in ModuleFactory.ModuleMap) + { + sb.AppendLine($"\t{module.Key} -> {module.Value.Description}"); + } + + var helpText = HelpText.AutoBuild(result, + h => { - Settings = o; - new Program().Run(); - }); + h.AdditionalNewLineAfterOption = false; + h.MaximumDisplayWidth = 120; + h.AddPostOptionsText(sb.ToString()); + + return HelpText.DefaultParsingErrorsHandler(result, h); + }, + e => e + ); + + Console.Error.WriteLine(helpText); } private void Run() @@ -49,7 +76,7 @@ private void Run() On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader)); On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}...")); - while (sequenceReader.ReadSequence(out Sequence? sequence) && sequenceReader.SequencesRead < Settings.ReadLimit) + while (sequenceReader.SequencesRead < Settings.ReadLimit && sequenceReader.ReadSequence(out Sequence? sequence)) { ArgumentNullException.ThrowIfNull(sequence); @@ -70,9 +97,13 @@ private void Run() Dictionary? results = new() { - ["_modules"] = Settings.ModuleNames, - ["_inputFilename"] = Settings.InputFilename, - ["_outputFilename"] = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + ["_metadata"] = new + { + _modules = Settings.ModuleNames, + _inputFilename = Settings.InputFilename, + _outputFilename = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + _sequences = sequenceReader.SequencesRead + } }; foreach (IQcModule? module in modules)