diff --git a/Interfaces/IQcModule.cs b/Interfaces/IQcModule.cs
index 293be39..a9392eb 100644
--- a/Interfaces/IQcModule.cs
+++ b/Interfaces/IQcModule.cs
@@ -7,6 +7,8 @@ public interface IQcModule
string Description { get; }
+ bool IsEnabledForAll { get; }
+
void ProcessSequence(Sequence sequence);
void Reset();
diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs
index b9625b0..effa7c3 100644
--- a/Models/ReadFlag.cs
+++ b/Models/ReadFlag.cs
@@ -47,6 +47,8 @@ public enum ReadFlag : ushort
///
LastSegment = 128,
+ EmbeddedSegment = FirstSegment | LastSegment,
+
///
/// not primary alignment
///
@@ -65,6 +67,8 @@ public enum ReadFlag : ushort
///
/// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region)
///
- SupplementaryAlignment = 2048
+ SupplementaryAlignment = 2048,
+
+ SecondaryAlignment = NotPrimaryAlignment | SupplementaryAlignment
}
}
\ No newline at end of file
diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs
index bd69d23..a080c94 100644
--- a/Modules/AlignmentStatistics.cs
+++ b/Modules/AlignmentStatistics.cs
@@ -1,10 +1,15 @@
+using System.Collections.Generic;
+using System.Linq;
+
namespace Ovation.FasterQC.Net
{
public class AlignmentStatistics : IQcModule
{
private ulong sequenceCount;
+ private ulong baseCount;
+
private ulong paired;
private ulong aligned;
@@ -19,30 +24,80 @@ public class AlignmentStatistics : IQcModule
private ulong nextSegmentReverseComplemented;
+ private ulong firstSegment;
+
+ private ulong lastSegment;
+
+ private ulong embeddedSegment;
+
+ private ulong unknownIndex;
+
+ private ulong primaryAlignment;
+
+ private ulong secondaryAlignment;
+
private ulong nonPrimaryAlignment;
private ulong failedQualityChecks;
private ulong opticalDuplicate;
+ private ulong alignedBases;
+
+ private readonly Dictionary readLengthHistogram = new();
+
public string Name => "alignmentStatistics";
+ public bool IsEnabledForAll => true;
+
public string Description => "Calculates alignment statistics for SAM/BAM files";
public void ProcessSequence(Sequence sequence)
{
sequenceCount++;
-
- if ((sequence.ReadFlag & ReadFlag.Paired) != 0) paired++;
- if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) aligned++;
+ baseCount += (ulong)sequence.Read.Length;
+
+ if (readLengthHistogram.ContainsKey(sequence.Read.Length) == false)
+ {
+ readLengthHistogram.Add(sequence.Read.Length, new ReadPair());
+ }
+
+ var readPair = readLengthHistogram[sequence.Read.Length];
+
+ // revisit the logic here based on https://samtools.github.io/hts-specs/SAMv1.pdf
+ if ((sequence.ReadFlag & ReadFlag.Paired) != 0)
+ {
+ paired++;
+ readPair.Paired++;
+ }
+ if ((sequence.ReadFlag & ReadFlag.Aligned) != 0)
+ {
+ aligned++;
+ alignedBases += (ulong)sequence.Read.Length;
+ readPair.AlignedAndPaired++;
+ }
if ((sequence.ReadFlag & ReadFlag.AlignedAndPaired) == ReadFlag.AlignedAndPaired) alignedAndPaired++;
if ((sequence.ReadFlag & ReadFlag.SegmentUnmapped) != 0) segmentUnmapped++;
if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++;
if ((sequence.ReadFlag & ReadFlag.ReverseComplemented) != 0) reverseComplemented++;
if ((sequence.ReadFlag & ReadFlag.NextSegmentReverseComplemented) != 0) nextSegmentReverseComplemented++;
+
+ if ((sequence.ReadFlag & ReadFlag.FirstSegment) != 0) firstSegment++;
+ if ((sequence.ReadFlag & ReadFlag.LastSegment) != 0) lastSegment++;
+ if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == ReadFlag.EmbeddedSegment) embeddedSegment++;
+ if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == 0) unknownIndex++;
+
if ((sequence.ReadFlag & ReadFlag.NotPrimaryAlignment) != 0) nonPrimaryAlignment++;
if ((sequence.ReadFlag & ReadFlag.FailedQualityChecks) != 0) failedQualityChecks++;
if ((sequence.ReadFlag & ReadFlag.OpticalDuplicate) != 0) opticalDuplicate++;
+ if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == 0)
+ {
+ primaryAlignment++;
+ }
+ if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == ReadFlag.SecondaryAlignment)
+ {
+ secondaryAlignment++;
+ }
}
public void Reset()
@@ -50,19 +105,62 @@ public void Reset()
sequenceCount = 0;
}
- public object Data => new
+ public object Data
{
- sequenceCount,
- paired,
- aligned,
- alignedAndPaired,
- segmentUnmapped,
- nextSegmentUnmapped,
- reverseComplemented,
- nextSegmentReverseComplemented,
- nonPrimaryAlignment,
- failedQualityChecks,
- opticalDuplicate
- };
+ get
+ {
+ var minReadLength = readLengthHistogram.Keys.Min();
+ var maxReadLength = readLengthHistogram.Keys.Max();
+
+ for (var readLength = minReadLength; readLength < maxReadLength; readLength++)
+ {
+ if (readLengthHistogram.ContainsKey(readLength) == false)
+ {
+ readLengthHistogram.Add(readLength, new ReadPair());
+ }
+ }
+
+ return new
+ {
+ sequenceCount,
+ paired,
+ aligned,
+ alignedAndPaired,
+ segmentUnmapped,
+ nextSegmentUnmapped,
+ reverseComplemented,
+ nextSegmentReverseComplemented,
+ firstSegment,
+ lastSegment,
+ embeddedSegment,
+ unknownIndex,
+ primaryAlignment,
+ secondaryAlignment,
+ nonPrimaryAlignment,
+ failedQualityChecks,
+ opticalDuplicate,
+ alignedBases,
+ averageReadLength = (double)baseCount / (double)sequenceCount,
+ histogram = new
+ {
+ minReadLength,
+ maxReadLength,
+ paired = readLengthHistogram
+ .OrderBy(k => k.Key)
+ .Select((k, v) => k.Value.Paired),
+ unpaired = readLengthHistogram
+ .OrderBy(k => k.Key)
+ .Select((k, v) => k.Value.AlignedAndPaired)
+ }
+ };
+ }
+ }
+
+ class ReadPair
+ {
+ public ulong Paired { get; set; }
+
+ public ulong AlignedAndPaired { get; set; }
+ }
}
}
\ No newline at end of file
diff --git a/Modules/BasicStatistics.cs b/Modules/BasicStatistics.cs
index db24e2e..8761a0c 100644
--- a/Modules/BasicStatistics.cs
+++ b/Modules/BasicStatistics.cs
@@ -24,6 +24,8 @@ public class BasicStatistics : IQcModule
public string Description => "Calculates basic quality statistics";
+ public bool IsEnabledForAll => true;
+
public void ProcessSequence(Sequence sequence)
{
var sequenceLength = sequence.Read.Length;
diff --git a/Modules/KmerContent.cs b/Modules/KmerContent.cs
index be701dd..315d004 100644
--- a/Modules/KmerContent.cs
+++ b/Modules/KmerContent.cs
@@ -22,6 +22,8 @@ public class KMerContent : IQcModule
public string Description => "Computes 4-mer counts across all sequences";
+ public bool IsEnabledForAll => true;
+
public object Data
{
get
diff --git a/Modules/MeanQualityDistribution.cs b/Modules/MeanQualityDistribution.cs
index 25f374e..ee4f9b5 100644
--- a/Modules/MeanQualityDistribution.cs
+++ b/Modules/MeanQualityDistribution.cs
@@ -20,6 +20,8 @@ public class MeanQualityDistribution : IQcModule
public string Description => "Calculates the quality distribution across all sequences";
+ public bool IsEnabledForAll => true;
+
public object Data
{
get
diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs
index 9e8a1d4..dea2b79 100644
--- a/Modules/ModuleFactory.cs
+++ b/Modules/ModuleFactory.cs
@@ -1,35 +1,58 @@
+using System;
using System.Collections.Generic;
using System.Linq;
+using System.Reflection;
using Ovation.FasterQC.Net.Utils;
namespace Ovation.FasterQC.Net.Modules
{
public static class ModuleFactory
{
- private static readonly Dictionary moduleMap = new()
+ private static Dictionary? moduleMap;
+
+ public static Dictionary ModuleMap
{
- ["AlignmentStatistics"] = new AlignmentStatistics(),
- ["BasicStatistics"] = new BasicStatistics(),
- ["KMerContent"] = new KMerContent(),
- ["NCountsAtPosition"] = new NCountsAtPosition(),
- ["PerPositionSequenceContent"] = new PerPositionSequenceContent(),
- ["PerSequenceGcContent"] = new PerSequenceGcContent(),
- ["QualityDistributionByBase"] = new QualityDistributionByBase(),
- ["MeanQualityDistribution"] = new MeanQualityDistribution(),
- ["SequenceLengthDistribution"] = new SequenceLengthDistribution(),
- ["PerPositionQuality"] = new PerPositionQuality(),
- };
+ get
+ {
+ if (moduleMap == null)
+ {
+ moduleMap = new Dictionary();
+
+ var modules = Assembly.GetExecutingAssembly()
+ .GetTypes()
+ .Where(t => string.IsNullOrEmpty(t.Namespace) == false && t.GetInterface(nameof(IQcModule)) != null)
+ .Select(t => Activator.CreateInstance(t))
+ .Cast();
+
+ foreach (var module in modules)
+ {
+ moduleMap.Add(module.GetType().Name, module);
+ }
+ }
+
+ return moduleMap;
+ }
+ }
public static IEnumerable Create(CliOptions settings)
{
if (settings.ModuleNames.Any() == false || settings.ModuleNames.First() == "all")
{
- settings.ModuleNames = moduleMap.Keys;
- return moduleMap.Values;
+ var moduleNames = new List();
+ var modules = new List();
+
+ foreach (var module in ModuleMap.Where(m => m.Value.IsEnabledForAll == true))
+ {
+ moduleNames.Add(module.Key);
+ modules.Add(module.Value);
+ }
+
+ settings.ModuleNames = moduleNames;
+ return modules;
}
else
{
- return settings.ModuleNames.Select(n => moduleMap[n]);
+ return settings.ModuleNames.Select(n => ModuleMap[n]);
}
}
}
diff --git a/Modules/NCountsAtPosition.cs b/Modules/NCountsAtPosition.cs
index d458a3a..91ee16c 100644
--- a/Modules/NCountsAtPosition.cs
+++ b/Modules/NCountsAtPosition.cs
@@ -5,12 +5,15 @@ namespace Ovation.FasterQC.Net
public class NCountsAtPosition : IQcModule
{
private int[] nCounts = Array.Empty();
+
private int[] notNCounts = Array.Empty();
public string Name => "nPercentages";
public string Description => "Calculates N counts at position along sequence";
+ public bool IsEnabledForAll => true;
+
public object Data
{
get
diff --git a/Modules/PerPositionQuality.cs b/Modules/PerPositionQuality.cs
index 88e0e2c..90e1ef9 100644
--- a/Modules/PerPositionQuality.cs
+++ b/Modules/PerPositionQuality.cs
@@ -8,12 +8,15 @@ public class PerPositionQuality : IQcModule
private QualityMetric[] qualities = Array.Empty();
private int minimumReadLength = int.MaxValue;
+
private int maximumReadLength = int.MinValue;
public string Name => "perPositionQuality";
public string Description => "Calculates the per-position quality metrics";
+ public bool IsEnabledForAll => true;
+
public object Data
{
get
diff --git a/Modules/PerPositionSequenceContent.cs b/Modules/PerPositionSequenceContent.cs
index c3f2ed0..e4e213d 100644
--- a/Modules/PerPositionSequenceContent.cs
+++ b/Modules/PerPositionSequenceContent.cs
@@ -6,8 +6,11 @@ namespace Ovation.FasterQC.Net
public class PerPositionSequenceContent : IQcModule
{
private ulong[] aCounts = Array.Empty();
+
private ulong[] cCounts = Array.Empty();
+
private ulong[] tCounts = Array.Empty();
+
private ulong[] gCounts = Array.Empty();
private ulong sequenceCount;
@@ -15,6 +18,8 @@ public class PerPositionSequenceContent : IQcModule
public string Description => "Calculates ATCG counts at position along sequence";
+ public bool IsEnabledForAll => true;
+
public object Data
{
get
diff --git a/Modules/PerSequenceGcContent.cs b/Modules/PerSequenceGcContent.cs
index 1fd9687..9ba1f09 100644
--- a/Modules/PerSequenceGcContent.cs
+++ b/Modules/PerSequenceGcContent.cs
@@ -6,12 +6,15 @@ namespace Ovation.FasterQC.Net
public class PerSequenceGcContent : IQcModule
{
private ulong[] gcCounts = Array.Empty();
+
private ulong sequenceCount;
public string Name => "gcDistribution";
public string Description => "Distribution of GC content percentages";
+ public bool IsEnabledForAll => true;
+
public object Data => gcCounts.Select(a => Math.Round((double)a / (double)sequenceCount * 100.0, 3));
public void ProcessSequence(Sequence sequence)
diff --git a/Modules/QualityDistributionByBase.cs b/Modules/QualityDistributionByBase.cs
index 285ce80..7b26f49 100644
--- a/Modules/QualityDistributionByBase.cs
+++ b/Modules/QualityDistributionByBase.cs
@@ -8,8 +8,11 @@ public class QualityDistributionByBase : IQcModule
private const byte ILLUMINA_BASE_ADJUSTMENT = 33;
private readonly ulong[] aQuality = new ulong[128];
+
private readonly ulong[] cQuality = new ulong[128];
+
private readonly ulong[] tQuality = new ulong[128];
+
private readonly ulong[] gQuality = new ulong[128];
private byte lowestScore = byte.MaxValue;
@@ -20,6 +23,8 @@ public class QualityDistributionByBase : IQcModule
public string Description => "Calculates the quality distribution across all sequences";
+ public bool IsEnabledForAll => true;
+
public object Data
{
get
diff --git a/Modules/SequenceLengthDistribution.cs b/Modules/SequenceLengthDistribution.cs
index 27547f2..9fc5412 100644
--- a/Modules/SequenceLengthDistribution.cs
+++ b/Modules/SequenceLengthDistribution.cs
@@ -6,6 +6,7 @@ namespace Ovation.FasterQC.Net
public class SequenceLengthDistribution : IQcModule
{
private int minimumReadLength = int.MaxValue;
+
private int maximumReadLength = int.MinValue;
private readonly IDictionary lengths = new Dictionary();
@@ -14,6 +15,8 @@ public class SequenceLengthDistribution : IQcModule
public string Description => "Calculates the sequence length distributions";
+ public bool IsEnabledForAll => true;
+
public object Data
{
get
diff --git a/Program.cs b/Program.cs
index 47d28a2..d1d5f20 100644
--- a/Program.cs
+++ b/Program.cs
@@ -1,8 +1,10 @@
using System;
using System.Collections.Generic;
using System.IO;
+using System.Text;
using System.Text.Json;
using CommandLine;
+using CommandLine.Text;
using Ovation.FasterQC.Net.Modules;
using Ovation.FasterQC.Net.Readers;
using Ovation.FasterQC.Net.Utils;
@@ -31,12 +33,37 @@ private static void Main(string[] args)
}
);
- _ = parser.ParseArguments(args)
- .WithParsed(o =>
+ var parserResult = parser.ParseArguments(args);
+
+ parserResult.WithParsed(o =>
+ {
+ Settings = o;
+ new Program().Run();
+ })
+ .WithNotParsed(errs => DisplayHelp(parserResult, errs));
+ }
+
+ static void DisplayHelp(ParserResult result, IEnumerable _)
+ {
+ var sb = new StringBuilder("List of available modules for --modules:").AppendLine();
+ foreach (var module in ModuleFactory.ModuleMap)
+ {
+ sb.AppendLine($"\t{module.Key} -> {module.Value.Description}");
+ }
+
+ var helpText = HelpText.AutoBuild(result,
+ h =>
{
- Settings = o;
- new Program().Run();
- });
+ h.AdditionalNewLineAfterOption = false;
+ h.MaximumDisplayWidth = 120;
+ h.AddPostOptionsText(sb.ToString());
+
+ return HelpText.DefaultParsingErrorsHandler(result, h);
+ },
+ e => e
+ );
+
+ Console.Error.WriteLine(helpText);
}
private void Run()
@@ -49,7 +76,7 @@ private void Run()
On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader));
On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}..."));
- while (sequenceReader.ReadSequence(out Sequence? sequence) && sequenceReader.SequencesRead < Settings.ReadLimit)
+ while (sequenceReader.SequencesRead < Settings.ReadLimit && sequenceReader.ReadSequence(out Sequence? sequence))
{
ArgumentNullException.ThrowIfNull(sequence);
@@ -70,9 +97,13 @@ private void Run()
Dictionary? results = new()
{
- ["_modules"] = Settings.ModuleNames,
- ["_inputFilename"] = Settings.InputFilename,
- ["_outputFilename"] = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename,
+ ["_metadata"] = new
+ {
+ _modules = Settings.ModuleNames,
+ _inputFilename = Settings.InputFilename,
+ _outputFilename = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename,
+ _sequences = sequenceReader.SequencesRead
+ }
};
foreach (IQcModule? module in modules)