From 30b7b84398821f9a5383c5dcfa1f354c02495eed Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 7 Jul 2022 12:43:34 -0700 Subject: [PATCH] adding multiqc-like output --- .vscode/launch.json | 8 +- Models/ReadFlag.cs | 6 +- Modules/AlignmentStatistics.cs | 12 +-- Modules/MultiQC.cs | 167 +++++++++++++++++++++++++++++++++ Readers/BamReader.cs | 2 +- Utils/Statistics.cs | 86 +++++++++++++++++ 6 files changed, 266 insertions(+), 15 deletions(-) create mode 100644 Modules/MultiQC.cs create mode 100644 Utils/Statistics.cs diff --git a/.vscode/launch.json b/.vscode/launch.json index 2b1df33..2487a3f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,16 +13,14 @@ "program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll", "args": [ "-d", - "-l", - "10", "-f", "bam", "-i", - "./tmp/in3257_2_S1.sorted.bam", + "/Volumes/Untitled/experiment/OUT/bob/6111_S7_L001/6111_S7_L001.aligned.bam", "-o", - "./tmp/bob.json", + "/Volumes/Untitled/experiment/OUT/bob/bob.json", "-m", - "BasicStatistics" + "MultiQC" ], "cwd": "${workspaceFolder}", // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs index effa7c3..ecbb016 100644 --- a/Models/ReadFlag.cs +++ b/Models/ReadFlag.cs @@ -40,14 +40,14 @@ public enum ReadFlag : ushort /// /// the first segment in the template (is read1) /// - FirstSegment = 64, + Read1 = 64, /// /// the last segment in the template (is read2) /// - LastSegment = 128, + Read2 = 128, - EmbeddedSegment = FirstSegment | LastSegment, + EmbeddedSegment = Read1 | Read2, /// /// not primary alignment diff --git a/Modules/AlignmentStatistics.cs b/Modules/AlignmentStatistics.cs index 923ef15..cc0fd25 100644 --- a/Modules/AlignmentStatistics.cs +++ b/Modules/AlignmentStatistics.cs @@ -24,9 +24,9 @@ public class AlignmentStatistics : IQcModule private ulong nextSegmentReverseComplemented; - private ulong firstSegment; + private ulong readOne; - private ulong lastSegment; + private ulong readTwo; private ulong embeddedSegment; @@ -84,8 +84,8 @@ public void ProcessSequence(Sequence sequence) if ((sequence.ReadFlag & ReadFlag.ReverseComplemented) != 0) reverseComplemented++; if ((sequence.ReadFlag & ReadFlag.NextSegmentReverseComplemented) != 0) nextSegmentReverseComplemented++; - if ((sequence.ReadFlag & ReadFlag.FirstSegment) != 0) firstSegment++; - if ((sequence.ReadFlag & ReadFlag.LastSegment) != 0) lastSegment++; + if ((sequence.ReadFlag & ReadFlag.Read1) != 0) readOne++; + if ((sequence.ReadFlag & ReadFlag.Read2) != 0) readTwo++; if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == ReadFlag.EmbeddedSegment) embeddedSegment++; if ((sequence.ReadFlag & (ReadFlag.EmbeddedSegment)) == 0) unknownIndex++; @@ -132,8 +132,8 @@ public object Data nextSegmentUnmapped, reverseComplemented, nextSegmentReverseComplemented, - firstSegment, - lastSegment, + readOne, + readTwo, embeddedSegment, unknownIndex, primaryAlignment, diff --git a/Modules/MultiQC.cs b/Modules/MultiQC.cs new file mode 100644 index 0000000..90977f0 --- /dev/null +++ b/Modules/MultiQC.cs @@ -0,0 +1,167 @@ +using System; +using System.Linq; + +namespace Ovation.FasterQC.Net +{ + public class MultiQC : IQcModule + { + private const byte ILLUMINA_BASE_ADJUSTMENT_LOWER = 33; + + private const byte ILLUMINA_BASE_ADJUSTMENT_UPPER = 126; + + private int maxReadLength = 0; + + private ulong sequenceCount = 0; + + private ulong[][] qualityScoresAtPosition = Array.Empty(); + + private uint lowestMean = uint.MaxValue; + + private uint highestMean = uint.MinValue; + + private ulong[] nCountsAtPosition = Array.Empty(); + + private readonly ulong[] gcPercentageCounts = new ulong[101]; + + private readonly ulong[] meanPhredCounts = new ulong[128]; + + public string Name => "multiqc"; + + public string Description => "Generates limited / Ovation-branded output compatible with MultiQC"; + + public bool IsEnabledForAll => true; + + public ReaderType SupportedReaders => ReaderType.AllReaders; + + public object Data + { + get + { + var meanPhreds = new ulong[maxReadLength][]; + for (var pos = 0; pos < maxReadLength; pos++) + { + meanPhreds[pos] = new ulong[128]; + for (var qual = 0UL; qual < 128; qual++) + { + meanPhreds[pos][qual] = qualityScoresAtPosition[pos][qual] * qual; + } + } + + var phreds = meanPhredCounts + .Skip(ILLUMINA_BASE_ADJUSTMENT_LOWER) + .Take(ILLUMINA_BASE_ADJUSTMENT_UPPER - ILLUMINA_BASE_ADJUSTMENT_LOWER + 1) + .ToArray(); + + return new + { + meanPhredByPosition = meanPhreds + .Select((p, idx) => new { idx, val = (int)(p.Sum(q => (decimal)q) / sequenceCount) }) + .ToDictionary(i => i.idx + 1, i => i.val), + + gcPercentageCounts = gcPercentageCounts + .Select((gcc, idx) => new { idx, gcc }) + .ToDictionary(i => i.idx, i => i.gcc), + + gcPercentagePercentage = gcPercentageCounts + .Select((gcc, idx) => new { idx, val = Math.Round((float)gcc / (float)sequenceCount * 100.0, 3) }) + .ToDictionary(i => i.idx, i => i.val), + + nCountsAtPosition = nCountsAtPosition + .Select((nc, idx) => new { idx, nc }) + .ToDictionary(i => i.idx + 1, i => i.nc), + + nPercentageAtPosition = nCountsAtPosition + .Select((nc, idx) => new { idx, val = Math.Round((float)nc / (float)sequenceCount * 100.0, 3) }) + .ToDictionary(i => i.idx + 1, i => i.val), + + meanPhredCounts = meanPhredCounts + .Skip(ILLUMINA_BASE_ADJUSTMENT_LOWER) + .Take(ILLUMINA_BASE_ADJUSTMENT_UPPER - ILLUMINA_BASE_ADJUSTMENT_LOWER + 1) + .Select((val, idx) => new { idx, val }) + .ToDictionary(i => ILLUMINA_BASE_ADJUSTMENT_LOWER + i.idx, i => i.val), + + meanPhredPercentage = meanPhredCounts + .Skip(ILLUMINA_BASE_ADJUSTMENT_LOWER) + .Take(ILLUMINA_BASE_ADJUSTMENT_UPPER - ILLUMINA_BASE_ADJUSTMENT_LOWER + 1) + .Select((pc, idx) => new { idx, val = Math.Round((float)pc / (float)sequenceCount * 100.0, 3) }) + .ToDictionary(i => ILLUMINA_BASE_ADJUSTMENT_LOWER + i.idx, i => i.val), + + phreds = phreds + .Select((val, idx) => new { phred = Convert.ToChar(ILLUMINA_BASE_ADJUSTMENT_LOWER + idx), count = val }) + .ToDictionary(i => i.phred, i => i.count) + }; + } + } + + public void ProcessSequence(Sequence sequence) + { + sequenceCount++; + + var sequenceLength = sequence.Read.Length; + + // see if we need to resize our arrays + if (sequenceLength > maxReadLength) + { + maxReadLength = sequenceLength; + + Array.Resize(ref qualityScoresAtPosition, maxReadLength); + for (var n = 0; n < maxReadLength; n++) + { + if (qualityScoresAtPosition[n] == null) + { + qualityScoresAtPosition[n] = new ulong[128]; + } + } + Array.Resize(ref nCountsAtPosition, maxReadLength); + } + + // calculate mean quality score + var sum = 0; + var count = 0; + + var qual = sequence.Quality; + for (var i = 0; i < sequenceLength; i++) + { + byte q = qual[i]; + + qualityScoresAtPosition[i][q]++; + sum += q; + count++; + } + + var mean = (uint)((float)sum / (float)count); + meanPhredCounts[mean]++; + + lowestMean = Math.Min(lowestMean, mean); + highestMean = Math.Max(highestMean, mean); + + // calculate the gc percentage and ncount of the read + var gcCount = 0; + var read = sequence.Read; + for (var i = 0; i < sequenceLength; i++) + { + switch (read[i]) + { + case (byte)'G': + case (byte)'C': + gcCount++; + break; + + case (byte)'N': + nCountsAtPosition[i]++; + break; + + default: + break; + } + } + + var gcPercentage = (int)((float)gcCount / (float)sequenceLength * 100.0); + gcPercentageCounts[gcPercentage]++; + } + + public void Reset() + { + } + } +} diff --git a/Readers/BamReader.cs b/Readers/BamReader.cs index e7dfb78..ffe30c4 100644 --- a/Readers/BamReader.cs +++ b/Readers/BamReader.cs @@ -12,7 +12,7 @@ public class BamReader : AbstractReader private bool disposedValue; public BamReader(string bam) : - base(bam, false) + base(bam, true) { binaryReader = new BinaryReader(bufferedStream); diff --git a/Utils/Statistics.cs b/Utils/Statistics.cs new file mode 100644 index 0000000..e457124 --- /dev/null +++ b/Utils/Statistics.cs @@ -0,0 +1,86 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Ovation.FasterQC.Net.Utils +{ + public static class Statistics + { + /// + /// Partitions the given list around a pivot element such that all elements on left of pivot are <= pivot + /// and the ones at thr right are > pivot. This method can be used for sorting, N-order statistics such as + /// as median finding algorithms. + /// Pivot is selected ranodmly if random number generator is supplied else its selected as last element in the list. + /// Reference: Introduction to Algorithms 3rd Edition, Corman et al, pp 171 + /// + private static int Partition(this IList list, int start, int end, Random? rnd = null) where T : IComparable + { + if (rnd != null) + list.Swap(end, rnd.Next(start, end + 1)); + + var pivot = list[end]; + var lastLow = start - 1; + + for (var i = start; i < end; i++) + { + if (list[i].CompareTo(pivot) <= 0) + list.Swap(i, ++lastLow); + } + + list.Swap(end, ++lastLow); + + return lastLow; + } + + /// + /// Returns Nth smallest element from the list. Here n starts from 0 so that n=0 returns minimum, n=1 returns 2nd smallest element etc. + /// Note: specified list would be mutated in the process. + /// Reference: Introduction to Algorithms 3rd Edition, Corman et al, pp 216 + /// + public static T NthOrderStatistic(this IList list, int n, Random? rnd = null) where T : IComparable + { + return NthOrderStatistic(list, n, 0, list.Count - 1, rnd); + } + + private static T NthOrderStatistic(this IList list, int n, int start, int end, Random? rnd) where T : IComparable + { + while (true) + { + var pivotIndex = list.Partition(start, end, rnd); + if (pivotIndex == n) + return list[pivotIndex]; + + if (n < pivotIndex) + end = pivotIndex - 1; + else + start = pivotIndex + 1; + } + } + + public static void Swap(this IList list, int i, int j) + { + // This check is not required but Partition + // function may make many calls so it's for perf reason + if (i == j) + return; + + (list[j], list[i]) = (list[i], list[j]); + } + + /// + /// Note: specified list would be mutated in the process. + /// + public static T Median(this IList list) where T : IComparable + { + return list.NthOrderStatistic((list.Count - 1) / 2); + } + + public static double Median(this IEnumerable sequence, Func getValue) + { + var list = sequence.Select(getValue).ToList(); + var mid = (list.Count - 1) / 2; + + return list.NthOrderStatistic(mid); + } + } +} \ No newline at end of file