diff --git a/.vscode/launch.json b/.vscode/launch.json index 6aa53df..2b1df33 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,19 +12,17 @@ // If you have changed target frameworks, make sure to update the program path. "program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll", "args": [ - "-v", "-d", "-l", - "1000000", + "10", "-f", - "sam", + "bam", "-i", - "./tmp/in3257_2_S1.sorted.sam", + "./tmp/in3257_2_S1.sorted.bam", "-o", "./tmp/bob.json", "-m", - "BasicStatistics", - "NCountsAtPosition" + "BasicStatistics" ], "cwd": "${workspaceFolder}", // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console diff --git a/Interfaces/IQcModule.cs b/Interfaces/IQcModule.cs index 293be39..a9392eb 100644 --- a/Interfaces/IQcModule.cs +++ b/Interfaces/IQcModule.cs @@ -7,6 +7,8 @@ public interface IQcModule string Description { get; } + bool IsEnabledForAll { get; } + void ProcessSequence(Sequence sequence); void Reset(); diff --git a/Models/BamAlignment.cs b/Models/BamAlignment.cs index 28cdc0f..f87b7cb 100644 --- a/Models/BamAlignment.cs +++ b/Models/BamAlignment.cs @@ -1,4 +1,7 @@ +using System; +using System.ComponentModel; using System.Diagnostics.CodeAnalysis; +using System.Text; namespace Ovation.FasterQC.Net { @@ -37,4 +40,94 @@ public class BamAlignment public byte[] qual { get; set; } = null!; } + + [SuppressMessage("Code style", "IDE1006", Justification = "Names correspond to BAM structure field names")] + public class BamOptionalElement + { + public char[] tag { get; set; } + + public char val_type { get; set; } + + public object value { get; set; } = null!; + + public BamOptionalElement(byte[] block, ref int offset) + { + tag = Encoding.ASCII.GetChars(block, offset, 2); offset += 2; + val_type = Encoding.ASCII.GetChars(block, offset, 1)[0]; offset += 1; + + // consume the rest + switch (val_type) + { + case 'A': offset += 1; break; + + // byte + case 'c': offset += 1; break; + case 'C': offset += 1; break; + + // short + case 's': offset += 2; break; + case 'S': offset += 2; break; + + // int + case 'i': offset += 4; break; + case 'I': offset += 4; break; + + // float + case 'f': offset += 4; break; + + // null-terminated string + case 'Z': + while (block[offset++] != 0) ; + break; + + // null-terminated hex digit pairs + case 'H': + while (block[offset++] != 0) ; + break; + + // array of stuff + case 'B': + var subtype = Encoding.ASCII.GetChars(block, offset, 1)[0]; offset += 1; + var length = BitConverter.ToUInt32(new Span(block, offset, 4)); offset += 4; + + // consume the stuff + for (var element = 0; element < length; element++) + { + switch (subtype) + { + // byte + case 'c': offset += 1; break; + case 'C': offset += 1; break; + + // short + case 's': offset += 2; break; + case 'S': offset += 2; break; + + // int + case 'i': offset += 4; break; + case 'I': offset += 4; break; + + // float + case 'f': offset += 4; break; + } + } + + break; + } + } + + public override string ToString() + { + var sb = new StringBuilder(); + + sb.Append("tag: "); + sb.Append(tag[0]); + sb.Append(tag[1]); + + sb.Append(", type: "); + sb.Append(val_type); + + return sb.ToString(); + } + } } diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs index b9625b0..effa7c3 100644 --- a/Models/ReadFlag.cs +++ b/Models/ReadFlag.cs @@ -47,6 +47,8 @@ public enum ReadFlag : ushort /// LastSegment = 128, + EmbeddedSegment = FirstSegment | LastSegment, + /// /// not primary alignment /// @@ -65,6 +67,8 @@ public enum ReadFlag : ushort /// /// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region) /// - SupplementaryAlignment = 2048 + SupplementaryAlignment = 2048, + + SecondaryAlignment = NotPrimaryAlignment | SupplementaryAlignment } } \ No newline at end of file diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index bd69d23..a080c94 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -1,10 +1,15 @@ +using System.Collections.Generic; +using System.Linq; + namespace Ovation.FasterQC.Net { public class AlignmentStatistics : IQcModule { private ulong sequenceCount; + private ulong baseCount; + private ulong paired; private ulong aligned; @@ -19,30 +24,80 @@ public class AlignmentStatistics : IQcModule private ulong nextSegmentReverseComplemented; + private ulong firstSegment; + + private ulong lastSegment; + + private ulong embeddedSegment; + + private ulong unknownIndex; + + private ulong primaryAlignment; + + private ulong secondaryAlignment; + private ulong nonPrimaryAlignment; private ulong failedQualityChecks; private ulong opticalDuplicate; + private ulong alignedBases; + + private readonly Dictionary readLengthHistogram = new(); + public string Name => "alignmentStatistics"; + public bool IsEnabledForAll => true; + public string Description => "Calculates alignment statistics for SAM/BAM files"; public void ProcessSequence(Sequence sequence) { sequenceCount++; - - if ((sequence.ReadFlag & ReadFlag.Paired) != 0) paired++; - if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) aligned++; + baseCount += (ulong)sequence.Read.Length; + + if (readLengthHistogram.ContainsKey(sequence.Read.Length) == false) + { + readLengthHistogram.Add(sequence.Read.Length, new ReadPair()); + } + + var readPair = readLengthHistogram[sequence.Read.Length]; + + // revisit the logic here based on https://samtools.github.io/hts-specs/SAMv1.pdf + if ((sequence.ReadFlag & ReadFlag.Paired) != 0) + { + paired++; + readPair.Paired++; + } + if ((sequence.ReadFlag & ReadFlag.Aligned) != 0) + { + aligned++; + alignedBases += (ulong)sequence.Read.Length; + readPair.AlignedAndPaired++; + } if ((sequence.ReadFlag & ReadFlag.AlignedAndPaired) == ReadFlag.AlignedAndPaired) alignedAndPaired++; if ((sequence.ReadFlag & ReadFlag.SegmentUnmapped) != 0) segmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.NextSegmentUnmapped) != 0) nextSegmentUnmapped++; if ((sequence.ReadFlag & ReadFlag.ReverseComplemented) != 0) reverseComplemented++; if ((sequence.ReadFlag & ReadFlag.NextSegmentReverseComplemented) != 0) nextSegmentReverseComplemented++; + + if ((sequence.ReadFlag & ReadFlag.FirstSegment) != 0) firstSegment++; + if ((sequence.ReadFlag & ReadFlag.LastSegment) != 0) lastSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == ReadFlag.EmbeddedSegment) embeddedSegment++; + if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == 0) unknownIndex++; + if ((sequence.ReadFlag & ReadFlag.NotPrimaryAlignment) != 0) nonPrimaryAlignment++; if ((sequence.ReadFlag & ReadFlag.FailedQualityChecks) != 0) failedQualityChecks++; if ((sequence.ReadFlag & ReadFlag.OpticalDuplicate) != 0) opticalDuplicate++; + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == 0) + { + primaryAlignment++; + } + if ((sequence.ReadFlag & ReadFlag.SecondaryAlignment) == ReadFlag.SecondaryAlignment) + { + secondaryAlignment++; + } } public void Reset() @@ -50,19 +105,62 @@ public void Reset() sequenceCount = 0; } - public object Data => new + public object Data { - sequenceCount, - paired, - aligned, - alignedAndPaired, - segmentUnmapped, - nextSegmentUnmapped, - reverseComplemented, - nextSegmentReverseComplemented, - nonPrimaryAlignment, - failedQualityChecks, - opticalDuplicate - }; + get + { + var minReadLength = readLengthHistogram.Keys.Min(); + var maxReadLength = readLengthHistogram.Keys.Max(); + + for (var readLength = minReadLength; readLength < maxReadLength; readLength++) + { + if (readLengthHistogram.ContainsKey(readLength) == false) + { + readLengthHistogram.Add(readLength, new ReadPair()); + } + } + + return new + { + sequenceCount, + paired, + aligned, + alignedAndPaired, + segmentUnmapped, + nextSegmentUnmapped, + reverseComplemented, + nextSegmentReverseComplemented, + firstSegment, + lastSegment, + embeddedSegment, + unknownIndex, + primaryAlignment, + secondaryAlignment, + nonPrimaryAlignment, + failedQualityChecks, + opticalDuplicate, + alignedBases, + averageReadLength = (double)baseCount / (double)sequenceCount, + histogram = new + { + minReadLength, + maxReadLength, + paired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.Paired), + unpaired = readLengthHistogram + .OrderBy(k => k.Key) + .Select((k, v) => k.Value.AlignedAndPaired) + } + }; + } + } + + class ReadPair + { + public ulong Paired { get; set; } + + public ulong AlignedAndPaired { get; set; } + } } } \ No newline at end of file diff --git a/Modules/BasicStatistics.cs b/Modules/BasicStatistics.cs index db24e2e..81f57e4 100644 --- a/Modules/BasicStatistics.cs +++ b/Modules/BasicStatistics.cs @@ -4,6 +4,8 @@ namespace Ovation.FasterQC.Net { public class BasicStatistics : IQcModule { + private const byte ILLUMINA_BASE_ADJUSTMENT = 33; + private ulong sequenceCount; private int minimumReadLength = int.MaxValue; @@ -24,6 +26,8 @@ public class BasicStatistics : IQcModule public string Description => "Calculates basic quality statistics"; + public bool IsEnabledForAll => true; + public void ProcessSequence(Sequence sequence) { var sequenceLength = sequence.Read.Length; @@ -85,7 +89,7 @@ public void Reset() nCount, xCount, - minimumQuality, + minimumQuality = Math.Min(minimumQuality - ILLUMINA_BASE_ADJUSTMENT, 0), gcContent = Math.Round((double)(cCount + gCount) / (double)totalBases * 100.0, 3), diff --git a/Modules/KmerContent.cs b/Modules/KmerContent.cs index be701dd..315d004 100644 --- a/Modules/KmerContent.cs +++ b/Modules/KmerContent.cs @@ -22,6 +22,8 @@ public class KMerContent : IQcModule public string Description => "Computes 4-mer counts across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/MeanQualityDistribution.cs b/Modules/MeanQualityDistribution.cs index 25f374e..e4a0358 100644 --- a/Modules/MeanQualityDistribution.cs +++ b/Modules/MeanQualityDistribution.cs @@ -12,23 +12,25 @@ public class MeanQualityDistribution : IQcModule // position and fix it when someone asks private readonly ulong[] qualityScores = new ulong[128]; - private byte lowestScore = byte.MaxValue; + private uint lowestMean = uint.MaxValue; - private byte highestScore = byte.MinValue; + private uint highestMean = uint.MinValue; public string Name => "meanQualityDistribution"; public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get { return new { - lowestScore = lowestScore - ILLUMINA_BASE_ADJUSTMENT, - highestScore = highestScore - ILLUMINA_BASE_ADJUSTMENT, - distribution = qualityScores.Skip(lowestScore).Take(highestScore - lowestScore + 1) + lowestMean = lowestMean - ILLUMINA_BASE_ADJUSTMENT, + highestMean = highestMean - ILLUMINA_BASE_ADJUSTMENT, + distribution = qualityScores.Skip((int)lowestMean).Take((int)(highestMean - lowestMean + 1)) }; } } @@ -41,16 +43,17 @@ public void ProcessSequence(Sequence sequence) var count = 0; var qual = sequence.Quality; - for (var q = 0; q < sequenceLength; q++) + for (var i = 0; i < sequenceLength; i++) { - lowestScore = Math.Min(lowestScore, qual[q]); - highestScore = Math.Max(highestScore, qual[q]); - - sum += qual[q]; + sum += qual[i]; count++; } - var mean = sum / count; + var mean = (uint)(sum / count); + + lowestMean = Math.Min(lowestMean, mean); + highestMean = Math.Max(highestMean, mean); + qualityScores[mean]++; } @@ -61,8 +64,8 @@ public void Reset() qualityScores[p] = 0; } - lowestScore = byte.MaxValue; - highestScore = byte.MinValue; + lowestMean = uint.MaxValue; + highestMean = uint.MinValue; } } } \ No newline at end of file diff --git a/Modules/ModuleFactory.cs b/Modules/ModuleFactory.cs index 9e8a1d4..dea2b79 100644 --- a/Modules/ModuleFactory.cs +++ b/Modules/ModuleFactory.cs @@ -1,35 +1,58 @@ +using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using Ovation.FasterQC.Net.Utils; namespace Ovation.FasterQC.Net.Modules { public static class ModuleFactory { - private static readonly Dictionary moduleMap = new() + private static Dictionary? moduleMap; + + public static Dictionary ModuleMap { - ["AlignmentStatistics"] = new AlignmentStatistics(), - ["BasicStatistics"] = new BasicStatistics(), - ["KMerContent"] = new KMerContent(), - ["NCountsAtPosition"] = new NCountsAtPosition(), - ["PerPositionSequenceContent"] = new PerPositionSequenceContent(), - ["PerSequenceGcContent"] = new PerSequenceGcContent(), - ["QualityDistributionByBase"] = new QualityDistributionByBase(), - ["MeanQualityDistribution"] = new MeanQualityDistribution(), - ["SequenceLengthDistribution"] = new SequenceLengthDistribution(), - ["PerPositionQuality"] = new PerPositionQuality(), - }; + get + { + if (moduleMap == null) + { + moduleMap = new Dictionary(); + + var modules = Assembly.GetExecutingAssembly() + .GetTypes() + .Where(t => string.IsNullOrEmpty(t.Namespace) == false && t.GetInterface(nameof(IQcModule)) != null) + .Select(t => Activator.CreateInstance(t)) + .Cast(); + + foreach (var module in modules) + { + moduleMap.Add(module.GetType().Name, module); + } + } + + return moduleMap; + } + } public static IEnumerable Create(CliOptions settings) { if (settings.ModuleNames.Any() == false || settings.ModuleNames.First() == "all") { - settings.ModuleNames = moduleMap.Keys; - return moduleMap.Values; + var moduleNames = new List(); + var modules = new List(); + + foreach (var module in ModuleMap.Where(m => m.Value.IsEnabledForAll == true)) + { + moduleNames.Add(module.Key); + modules.Add(module.Value); + } + + settings.ModuleNames = moduleNames; + return modules; } else { - return settings.ModuleNames.Select(n => moduleMap[n]); + return settings.ModuleNames.Select(n => ModuleMap[n]); } } } diff --git a/Modules/NCountsAtPosition.cs b/Modules/NCountsAtPosition.cs index d458a3a..91ee16c 100644 --- a/Modules/NCountsAtPosition.cs +++ b/Modules/NCountsAtPosition.cs @@ -5,12 +5,15 @@ namespace Ovation.FasterQC.Net public class NCountsAtPosition : IQcModule { private int[] nCounts = Array.Empty(); + private int[] notNCounts = Array.Empty(); public string Name => "nPercentages"; public string Description => "Calculates N counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionQuality.cs b/Modules/PerPositionQuality.cs index 88e0e2c..90e1ef9 100644 --- a/Modules/PerPositionQuality.cs +++ b/Modules/PerPositionQuality.cs @@ -8,12 +8,15 @@ public class PerPositionQuality : IQcModule private QualityMetric[] qualities = Array.Empty(); private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; public string Name => "perPositionQuality"; public string Description => "Calculates the per-position quality metrics"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerPositionSequenceContent.cs b/Modules/PerPositionSequenceContent.cs index c3f2ed0..e4e213d 100644 --- a/Modules/PerPositionSequenceContent.cs +++ b/Modules/PerPositionSequenceContent.cs @@ -6,8 +6,11 @@ namespace Ovation.FasterQC.Net public class PerPositionSequenceContent : IQcModule { private ulong[] aCounts = Array.Empty(); + private ulong[] cCounts = Array.Empty(); + private ulong[] tCounts = Array.Empty(); + private ulong[] gCounts = Array.Empty(); private ulong sequenceCount; @@ -15,6 +18,8 @@ public class PerPositionSequenceContent : IQcModule public string Description => "Calculates ATCG counts at position along sequence"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/PerSequenceGcContent.cs b/Modules/PerSequenceGcContent.cs index 1fd9687..9ba1f09 100644 --- a/Modules/PerSequenceGcContent.cs +++ b/Modules/PerSequenceGcContent.cs @@ -6,12 +6,15 @@ namespace Ovation.FasterQC.Net public class PerSequenceGcContent : IQcModule { private ulong[] gcCounts = Array.Empty(); + private ulong sequenceCount; public string Name => "gcDistribution"; public string Description => "Distribution of GC content percentages"; + public bool IsEnabledForAll => true; + public object Data => gcCounts.Select(a => Math.Round((double)a / (double)sequenceCount * 100.0, 3)); public void ProcessSequence(Sequence sequence) diff --git a/Modules/PolyX.cs b/Modules/PolyX.cs new file mode 100644 index 0000000..1635f41 --- /dev/null +++ b/Modules/PolyX.cs @@ -0,0 +1,111 @@ +namespace Ovation.FasterQC.Net +{ + public class PolyX : IQcModule + { + private const byte POLY_SIZE = 10; + + private readonly ulong[] polyA = new ulong[2]; + + private readonly ulong[] polyT = new ulong[2]; + + private readonly ulong[] polyC = new ulong[2]; + + private readonly ulong[] polyG = new ulong[2]; + + private readonly ulong[] polyN = new ulong[2]; + + private ulong sequenceCount; + + private ulong tooShort; + + public string Name => "polyX"; + + public string Description => "Calculates the the number of reads with polyX heads / tails"; + + public bool IsEnabledForAll => true; + + public object Data + { + get + { + return new + { + sequenceCount, + tooShort, + target = POLY_SIZE, + head = new { a = polyA[0], t = polyT[0], c = polyC[0], g = polyG[0], n = polyN[0] }, + tail = new { a = polyA[1], t = polyT[1], c = polyC[1], g = polyG[1], n = polyN[1] }, + }; + } + } + + public void ProcessSequence(Sequence sequence) + { + sequenceCount++; + + if (sequence.Read.Length < POLY_SIZE) + { + tooShort++; + return; + } + + if (CheckPolyHead((byte)'A', sequence.Read) == true) polyA[0]++; + if (CheckPolyTail((byte)'A', sequence.Read) == true) polyA[1]++; + + if (CheckPolyHead((byte)'T', sequence.Read) == true) polyT[0]++; + if (CheckPolyTail((byte)'T', sequence.Read) == true) polyT[1]++; + + if (CheckPolyHead((byte)'C', sequence.Read) == true) polyC[0]++; + if (CheckPolyTail((byte)'C', sequence.Read) == true) polyC[1]++; + + if (CheckPolyHead((byte)'G', sequence.Read) == true) polyG[0]++; + if (CheckPolyTail((byte)'G', sequence.Read) == true) polyG[1]++; + + if (CheckPolyHead((byte)'N', sequence.Read) == true) polyN[0]++; + if (CheckPolyTail((byte)'N', sequence.Read) == true) polyN[1]++; + } + + public void Reset() + { + polyA[0] = 0; polyA[1] = 0; + polyT[0] = 0; polyT[1] = 0; + polyC[0] = 0; polyC[1] = 0; + polyG[0] = 0; polyG[1] = 0; + polyN[0] = 0; polyN[1] = 0; + + sequenceCount = 0; + tooShort = 0; + } + + private static bool CheckPolyHead(byte test, byte[] sequence) + { + // consider a loop unroll here + for (var i = 0; i < POLY_SIZE; i++) + { + if (sequence[i] != test) + { + return false; + } + } + + return true; + } + + private static bool CheckPolyTail(byte test, byte[] sequence) + { + var endOfRead = sequence.Length - 1; + var endOfPoly = sequence.Length - (POLY_SIZE + 1); + + // consider a loop unroll here + for (var i = endOfRead; i > endOfPoly; i--) + { + if (sequence[i] != test) + { + return false; + } + } + + return true; + } + } +} diff --git a/Modules/QualityDistribution.cs b/Modules/QualityDistribution.cs new file mode 100644 index 0000000..666cc47 --- /dev/null +++ b/Modules/QualityDistribution.cs @@ -0,0 +1,66 @@ +using System; +using System.Linq; + +namespace Ovation.FasterQC.Net +{ + public class QualityDistribution : IQcModule + { + private const byte ILLUMINA_BASE_ADJUSTMENT = 33; + + private readonly ulong[] qualities = new ulong[128]; + + private byte lowestScore = byte.MaxValue; + + private byte highestScore = byte.MinValue; + + public string Name => "qualityDistribution"; + + public string Description => "Calculates the distribution of quality scores"; + + public bool IsEnabledForAll => true; + + public object Data + { + get + { + return new + { + lowestScore = lowestScore - ILLUMINA_BASE_ADJUSTMENT, + highestScore = highestScore - ILLUMINA_BASE_ADJUSTMENT, + + lowestScoreUnadjusted = lowestScore, + highestScoreUnadjusted = highestScore, + + qualities = qualities.Skip(lowestScore).Take(highestScore - lowestScore + 1), + }; + } + } + + public void ProcessSequence(Sequence sequence) + { + var sequenceLength = sequence.Read.Length; + var quals = sequence.Quality; + + for (var i = 0; i < sequenceLength; i++) + { + var qual = quals[i]; + + lowestScore = Math.Min(lowestScore, quals[i]); + highestScore = Math.Max(highestScore, quals[i]); + + qualities[qual]++; + } + } + + public void Reset() + { + for (var p = 0; p < qualities.Length; p++) + { + qualities[p] = 0; + } + + lowestScore = byte.MaxValue; + highestScore = byte.MinValue; + } + } +} diff --git a/Modules/QualityDistributionByBase.cs b/Modules/QualityDistributionByBase.cs index 285ce80..7b26f49 100644 --- a/Modules/QualityDistributionByBase.cs +++ b/Modules/QualityDistributionByBase.cs @@ -8,8 +8,11 @@ public class QualityDistributionByBase : IQcModule private const byte ILLUMINA_BASE_ADJUSTMENT = 33; private readonly ulong[] aQuality = new ulong[128]; + private readonly ulong[] cQuality = new ulong[128]; + private readonly ulong[] tQuality = new ulong[128]; + private readonly ulong[] gQuality = new ulong[128]; private byte lowestScore = byte.MaxValue; @@ -20,6 +23,8 @@ public class QualityDistributionByBase : IQcModule public string Description => "Calculates the quality distribution across all sequences"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Modules/SequenceLengthDistribution.cs b/Modules/SequenceLengthDistribution.cs index 27547f2..9fc5412 100644 --- a/Modules/SequenceLengthDistribution.cs +++ b/Modules/SequenceLengthDistribution.cs @@ -6,6 +6,7 @@ namespace Ovation.FasterQC.Net public class SequenceLengthDistribution : IQcModule { private int minimumReadLength = int.MaxValue; + private int maximumReadLength = int.MinValue; private readonly IDictionary lengths = new Dictionary(); @@ -14,6 +15,8 @@ public class SequenceLengthDistribution : IQcModule public string Description => "Calculates the sequence length distributions"; + public bool IsEnabledForAll => true; + public object Data { get diff --git a/Program.cs b/Program.cs index bfa192b..d1d5f20 100644 --- a/Program.cs +++ b/Program.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Text; using System.Text.Json; using CommandLine; using CommandLine.Text; @@ -33,21 +34,36 @@ private static void Main(string[] args) ); var parserResult = parser.ParseArguments(args); - parserResult - .WithParsed(o => - { - Settings = o; - new Program().Run(); - }) - .WithNotParsed(x => - { - var helpText = HelpText.AutoBuild(parserResult, h => + + parserResult.WithParsed(o => + { + Settings = o; + new Program().Run(); + }) + .WithNotParsed(errs => DisplayHelp(parserResult, errs)); + } + + static void DisplayHelp(ParserResult result, IEnumerable _) + { + var sb = new StringBuilder("List of available modules for --modules:").AppendLine(); + foreach (var module in ModuleFactory.ModuleMap) + { + sb.AppendLine($"\t{module.Key} -> {module.Value.Description}"); + } + + var helpText = HelpText.AutoBuild(result, + h => { - h.AutoVersion = false; // hides --version - return HelpText.DefaultParsingErrorsHandler(parserResult, h); - }, e => e); - Console.Error.WriteLine(helpText); - }); + h.AdditionalNewLineAfterOption = false; + h.MaximumDisplayWidth = 120; + h.AddPostOptionsText(sb.ToString()); + + return HelpText.DefaultParsingErrorsHandler(result, h); + }, + e => e + ); + + Console.Error.WriteLine(helpText); } private void Run() @@ -60,7 +76,7 @@ private void Run() On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader)); On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}...")); - while (sequenceReader.ReadSequence(out Sequence? sequence) && sequenceReader.SequencesRead < Settings.ReadLimit) + while (sequenceReader.SequencesRead < Settings.ReadLimit && sequenceReader.ReadSequence(out Sequence? sequence)) { ArgumentNullException.ThrowIfNull(sequence); @@ -81,9 +97,13 @@ private void Run() Dictionary? results = new() { - ["_modules"] = Settings.ModuleNames, - ["_inputFilename"] = Settings.InputFilename, - ["_outputFilename"] = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + ["_metadata"] = new + { + _modules = Settings.ModuleNames, + _inputFilename = Settings.InputFilename, + _outputFilename = string.IsNullOrWhiteSpace(Settings.OutputFilename) ? "STDOUT" : Settings.OutputFilename, + _sequences = sequenceReader.SequencesRead + } }; foreach (IQcModule? module in modules) diff --git a/R/fasterqc.R b/R/fasterqc.R new file mode 100644 index 0000000..acfd357 --- /dev/null +++ b/R/fasterqc.R @@ -0,0 +1,305 @@ +library(tidyverse) +library(jsonlite) +library(tidyjson) +library(ggplot2) +library(hrbrthemes) +library(tidyr) +library(stringr) +library(reshape2) +library(grid) + +# Multiple plot function +# +# ggplot objects can be passed in ..., or to plotlist +# (as a list of ggplot objects) +# - cols: Number of columns in layout +# - layout: A matrix specifying the layout. If present, 'cols' is ignored. +# +# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), +# then plot 1 will go in the upper left, 2 will go in the upper right, and +# 3 will go all the way across the bottom. +# +multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) { + # Make a list from the ... arguments and plotlist + plots <- c(list(...), plotlist) + + num_plots <- length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(num_plots / cols)), + ncol = cols, nrow = ceiling(num_plots / cols) + ) + } + + if (num_plots == 1) { + print(plots[[1]]) + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:num_plots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport( + layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col + )) + } + } +} + +setwd("/Users/michaeljon/src/ovation/fasterqc.net/tmp") +sample <- "in3257_2_S1.sorted" + +json <- + jsonlite::fromJSON(readLines(paste(getwd(), "/", sample, ".json", sep = ""))) + +# plot the ACTGs +percent_actg <- data.frame( + as = json$baseCounts$aPercentage, + ts = json$baseCounts$tPercentage, + cs = json$baseCounts$cPercentage, + gs = json$baseCounts$gPercentage +) + +plt <- ggplot(percent_actg, aes(x = seq_along(as))) + + geom_line(aes(y = as, color = "%A")) + + geom_line(aes(y = ts, color = "%T")) + + geom_line(aes(y = cs, color = "%C")) + + geom_line(aes(y = gs, color = "%G")) + + coord_cartesian(ylim = c(0, 100)) + + labs( + color = "Nucleotide", + title = "Percentage of ACTG by sequence position", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "% of ACTG", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".pct-actg.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot the N percentages +n_percentages <- data.frame( + n = json$nPercentages$percentages +) + +plt <- ggplot(n_percentages, aes(x = seq_along(n))) + + geom_line(aes(y = n)) + + coord_cartesian(ylim = c(0, 1)) + + labs( + title = "Percent of N reads", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "% N", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".pct-n.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot GC distribution +gc_distribution <- data.frame( + gc = json$gcDistribution +) + +plt <- ggplot(gc_distribution, aes(x = seq_along(gc))) + + geom_line(aes(y = gc, color = "% GC")) + + # geom_line(aes(y = cumsum(gc), color = "Cumulative")) + + scale_y_continuous(labels = scales::comma) + + labs( + color = "", + title = "GC distribution", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "% GC", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".gc-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot quality distributions +qualities_by_base <- data.frame( + as = json$qualityDistributionByBase$aDistribution, + ts = json$qualityDistributionByBase$tDistribution, + cs = json$qualityDistributionByBase$cDistribution, + gs = json$qualityDistributionByBase$gDistribution +) + +plt <- ggplot(qualities_by_base, aes(x = seq_along(as))) + + geom_line(aes(y = as, color = "%A")) + + geom_line(aes(y = ts, color = "%T")) + + geom_line(aes(y = cs, color = "%C")) + + geom_line(aes(y = gs, color = "%G")) + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + color = "Nucleotide", + title = "Quality distribution by base", + subtitle = paste("Sample ", sample, sep = ""), + x = "Quality (phred)", + y = "# of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".base-qual-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot quality distributions +quality_dist <- data.frame( + qs = json$qualityDistribution$qualities +) + +plt <- ggplot(quality_dist, aes(x = seq_along(qs))) + + geom_col(aes(y = qs), fill = "#7faf7f") + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + color = "", + title = "Quality distribution", + subtitle = paste("Sample ", sample, sep = ""), + x = "Quality (phred)", + y = "# of bases (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".qual-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +# plot quality distributions +quality_distribution <- data.frame( + qual = json$meanQualityDistribution$distribution, + low = json$meanQualityDistribution$lowestMean, + high = json$meanQualityDistribution$highestMean +) + +lowest_score <- min(quality_distribution$low) +highest_score <- max(quality_distribution$high) + +plt <- ggplot(quality_distribution, aes(x = lowest_score:highest_score)) + + geom_line(aes(y = qual)) + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + title = "Mean quality distribution", + subtitle = paste("Sample ", sample, sep = ""), + x = "Quality (phred)", + y = "# of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".mean-qual-dist.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +sequence_length_distribution <- data.frame( + len = json$sequenceLengthDistribution$distribution +) + +plt <- ggplot(sequence_length_distribution, aes(x = seq_along(len))) + + geom_line(aes(y = len)) + + scale_y_continuous(labels = scales::comma, trans = "log10") + + labs( + title = "Read count by length", + subtitle = paste("Sample ", sample, sep = ""), + x = "Length of read", + y = "# of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".readlen.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +per_position_qualities <- data.frame( + json$perPositionQual |> select(-distribution) +) + +lowest_score <- min(per_position_qualities$lowestScore) +highest_score <- min(per_position_qualities$highestScore) + +plt <- ggplot(per_position_qualities, aes(x = position)) + + geom_line(aes(y = mean, color = "mean")) + + geom_smooth(aes(y = lowest_score, color = "min"), level = 0.95) + + geom_smooth(aes(y = highest_score, color = "max"), level = 0.95) + + coord_cartesian(ylim = c(lowest_score, highest_score)) + + scale_y_continuous(labels = scales::comma) + + labs( + color = "", + title = "Read quality by position", + subtitle = paste("Sample ", sample, sep = ""), + x = "Position in read", + y = "Phred score", + caption = "Species: SARS-CoV-2", + ) + + theme_ipsum() + +png(paste(sample, ".qual.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +histogram <- data.frame( + paired = json$alignmentStatistics$histogram$paired, + unpaired = json$alignmentStatistics$histogram$unpaired +) + +minimum_read_length <- json$alignmentStatistics$histogram$minReadLength +maximum_read_length <- json$alignmentStatistics$histogram$maxReadLength + +histogram$lengths <- minimum_read_length:maximum_read_length + +plt <- ggplot(histogram, aes(x = lengths)) + + geom_col(aes(y = paired), fill = "#ff7f7f") + + scale_y_continuous(labels = scales::comma, trans = "log10") + + coord_cartesian(xlim = c(minimum_read_length, maximum_read_length)) + + labs( + color = "Legend", + title = "Paired segments", + subtitle = "Sample in3257_2_S1", + x = "Length of read", + y = "Number of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +png(paste(sample, ".paired.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() + +plt <- ggplot(histogram, aes(x = lengths)) + + geom_col(aes(y = unpaired), fill = "#7f7fff") + + scale_y_continuous(labels = scales::comma, trans = "log10") + + coord_cartesian(xlim = c(minimum_read_length, maximum_read_length)) + + labs( + color = "Legend", + title = "Aligned and paired segments", + subtitle = "Sample in3257_2_S1", + x = "Length of read", + y = "Number of sequences (log)", + caption = "Species: SARS-CoV-2" + ) + + theme_ipsum() + +# call multiplot(paired, unpaired) to put two of these on the page + +png(paste(sample, ".aligned.png", sep = ""), width = 800, height = 600) +print(plt) +dev.off() diff --git a/Readers/BamReader.cs b/Readers/BamReader.cs index fd548c6..6463cea 100644 --- a/Readers/BamReader.cs +++ b/Readers/BamReader.cs @@ -164,6 +164,12 @@ private BamAlignment ReadSequence() { bamAlignment.qual = Array.Empty(); } + offset += (int)bamAlignment.l_seq; + + while (offset < block_size) + { + _ = new BamOptionalElement(block, ref offset); + } return bamAlignment; } diff --git a/Utils/TimedSequenceProgressBar.cs b/Utils/TimedSequenceProgressBar.cs index 71e8290..51247fc 100644 --- a/Utils/TimedSequenceProgressBar.cs +++ b/Utils/TimedSequenceProgressBar.cs @@ -25,19 +25,27 @@ public TimedSequenceProgressBar(ISequenceReader sequenceReader) : base(100, "Pro public void Update(bool force = false) { - var read = sequenceReader.SequencesRead; - var percent = sequenceReader.ApproximateCompletion; + var sequencesRead = sequenceReader.SequencesRead; + var approximateCompletion = sequenceReader.ApproximateCompletion; - if (force || read % CliOptions.UpdatePeriod == 0) + if (force || sequencesRead % CliOptions.UpdatePeriod == 0) { - if (percent > 0) + // if we're limiting the number of reads then the reader's + // approximation will be incorrect (it's based on file positions), + // so we'll do the math ourselves + if (CliOptions.Settings.ReadLimit < ulong.MaxValue) { - var remainingTime = elapsedTime.ElapsedMilliseconds / percent * 100.0; + approximateCompletion = 100.0 * sequencesRead / CliOptions.Settings.ReadLimit; + } + + if (approximateCompletion > 0) + { + var remainingTime = elapsedTime.ElapsedMilliseconds / approximateCompletion * 100.0; EstimatedDuration = TimeSpan.FromMilliseconds(remainingTime); } - AsProgress().Report(percent / 100.0); - Message = $"{read.WithSsiUnits()} sequences completed"; + AsProgress().Report(approximateCompletion / 100.0); + Message = $"{sequencesRead.WithSsiUnits()} sequences completed"; } } }