diff --git a/.vscode/launch.json b/.vscode/launch.json index d62fcf7..5315e07 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,12 +12,23 @@ // If you have changed target frameworks, make sure to update the program path. "program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll", "args": [ - "-p", "-i", "/tmp/zr6254_1/zr6254_1.sorted.bam", "-o", "/tmp/bob.json", "-m", "BasicStatistics", "NCountsAtPosition" + "-v", + "-d", + "-f", + "sam", + "-i", + "./tmp/in3257_2_S1.sorted.sam", + "-o", + "./tmp/bob.json", + "-m", + "BasicStatistics", + "NCountsAtPosition" ], "cwd": "${workspaceFolder}", // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console "console": "internalConsole", - "stopAtEntry": false + "stopAtEntry": false, + "requireExactSource": false }, { "name": ".NET Core Attach", diff --git a/Interfaces/ISequenceReader.cs b/Interfaces/ISequenceReader.cs index fe37083..954aa5a 100644 --- a/Interfaces/ISequenceReader.cs +++ b/Interfaces/ISequenceReader.cs @@ -6,7 +6,7 @@ public interface ISequenceReader : IDisposable { int SequencesRead { get; } - bool ReadSequence(out Sequence sequence); + bool ReadSequence(out Sequence? sequence); double ApproximateCompletion { get; } } diff --git a/Models/BamAlignment.cs b/Models/BamAlignment.cs index 23dda7e..28cdc0f 100644 --- a/Models/BamAlignment.cs +++ b/Models/BamAlignment.cs @@ -1,6 +1,8 @@ +using System.Diagnostics.CodeAnalysis; + namespace Ovation.FasterQC.Net { -#pragma warning disable IDE1006 + [SuppressMessage("Code style", "IDE1006", Justification = "Names correspond to BAM structure field names")] public class BamAlignment { public uint block_size { get; set; } @@ -27,13 +29,12 @@ public class BamAlignment public int tlen { get; set; } - public byte[] read_name { get; set; } + public byte[] read_name { get; set; } = null!; - public uint[] cigar { get; set; } + public uint[] cigar { get; set; } = null!; - public byte[] seq { get; set; } + public byte[] seq { get; set; } = null!; - public byte[] qual { get; set; } + public byte[] qual { get; set; } = null!; } -#pragma warning restore IDE1006 -} \ No newline at end of file +} diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs new file mode 100644 index 0000000..5af2fae --- /dev/null +++ b/Models/ReadFlag.cs @@ -0,0 +1,68 @@ +using System; + +namespace Ovation.FasterQC.Net +{ + [Flags] + public enum ReadFlag : ushort + { + /// + /// template having multiple templates in sequencing (read is paired) + /// + Paired = 1, + + /// + /// each segment properly aligned according to the aligner (read mapped in proper pair) + /// + Aligned = 2, + + /// + /// segment unmapped (read1 unmapped) + /// + SegmentUnmapped = 4, + + /// + /// next segment in the template unmapped (read2 unmapped) + /// + NextSegmentUnmapped = 8, + + /// + /// SEQ being reverse complemented (read1 reverse complemented) + /// + ReverseComplemented = 16, + + /// + /// SEQ of the next segment in the template being reverse complemented (read2 reverse complemented) + /// + NextSegmentReverseComplemented = 32, + + /// + /// the first segment in the template (is read1) + /// + FirstSegment = 64, + + /// + /// the last segment in the template (is read2) + /// + LastSegment = 128, + + /// + /// not primary alignment + /// + NotPrimaryAlignment = 256, + + /// + /// alignment fails quality checks + /// + FailedQualityChecks = 512, + + /// + /// PCR or optical duplicate + /// + OpticalDuplicate = 1024, + + /// + /// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region) + /// + SupplementaryAlignment = 2048 + } +} \ No newline at end of file diff --git a/Models/Sequence.cs b/Models/Sequence.cs index e9eff94..b4bd196 100644 --- a/Models/Sequence.cs +++ b/Models/Sequence.cs @@ -5,6 +5,8 @@ namespace Ovation.FasterQC.Net { public class Sequence { + public ReadFlag ReadFlag { get; } + public byte[] Identifier { get; } public byte[] Read { get; } @@ -21,8 +23,9 @@ public Sequence(byte[] lines, int[] endOfLines) Quality = new ReadOnlyMemory(lines, endOfLines[2], endOfLines[3] - endOfLines[2]).ToArray(); } - public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality) + public Sequence(ushort readFlag, byte[] identifer, byte[] read, byte[] blank, byte[] quality) { + ReadFlag = (ReadFlag)readFlag; Identifier = new ReadOnlyMemory(identifer).ToArray(); Read = new ReadOnlyMemory(read).ToArray(); Blank = new ReadOnlyMemory(blank).ToArray(); @@ -31,15 +34,18 @@ public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality) public Sequence(BamAlignment bamAlignment) { + ReadFlag = (ReadFlag)bamAlignment.flag; Identifier = new ReadOnlyMemory(bamAlignment.read_name).ToArray(); Read = new ReadOnlyMemory(bamAlignment.seq).ToArray(); Quality = new ReadOnlyMemory(bamAlignment.qual).ToArray(); + Blank = Array.Empty(); } public override string ToString() { var sb = new StringBuilder("sequence: \n"); + sb.AppendLine(ReadFlag.ToString()); sb.AppendLine(new string(Encoding.ASCII.GetChars(Identifier))); sb.AppendLine(new string(Encoding.ASCII.GetChars(Read))); sb.AppendLine(new string(Encoding.ASCII.GetChars(Blank))); @@ -48,4 +54,4 @@ public override string ToString() return sb.ToString(); } } -} \ No newline at end of file +} diff --git a/Ovation.FasterQC.Net.csproj b/Ovation.FasterQC.Net.csproj index 9eb1d86..77ff3f2 100644 --- a/Ovation.FasterQC.Net.csproj +++ b/Ovation.FasterQC.Net.csproj @@ -3,6 +3,7 @@ Exe net6.0 + enable diff --git a/Program.cs b/Program.cs index 527619b..a502ffe 100644 --- a/Program.cs +++ b/Program.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text.Json; using CommandLine; using Ovation.FasterQC.Net.Modules; @@ -19,17 +20,24 @@ class Program PropertyNamingPolicy = JsonNamingPolicy.CamelCase }; - private TimedSequenceProgressBar progressBar; + private TimedSequenceProgressBar? progressBar; static void Main(string[] args) { - Parser.Default.ParseArguments(args) - .WithParsed(o => + var parser = new Parser(config => { - o.Validate(); - Settings = o; - new Program().Run(); - }); + config.AutoHelp = true; + config.AutoVersion = true; + config.CaseInsensitiveEnumValues = true; + } + ); + + parser.ParseArguments(args) + .WithParsed(o => + { + Settings = o; + new Program().Run(); + }); } private void Run() @@ -42,14 +50,16 @@ private void Run() On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader)); On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}...")); - while (sequenceReader.ReadSequence(out Sequence sequence)) + while (sequenceReader.ReadSequence(out Sequence? sequence)) { + ArgumentNullException.ThrowIfNull(sequence); + foreach (var module in modules) { module.ProcessSequence(sequence); } - On(Settings.ShowProgress, () => progressBar.Update()); + On(Settings.ShowProgress, () => progressBar?.Update()); On(Settings.Verbose, () => { if (sequenceReader.SequencesRead % UpdatePeriod == 0) @@ -71,7 +81,7 @@ private void Run() results[module.Name] = module.Data; } - On(Settings.ShowProgress, () => progressBar.Update(force: true)); + On(Settings.ShowProgress, () => progressBar?.Update(force: true)); On(Settings.Verbose, () => Console.Error.WriteLine($"{sequenceReader.SequencesRead.WithSsiUnits()} sequences completed ({sequenceReader.ApproximateCompletion:0.0}%)")); if (string.IsNullOrWhiteSpace(Settings.OutputFilename)) diff --git a/Readers/BamReader.cs b/Readers/BamReader.cs index a2d9d4c..5c6f5a0 100644 --- a/Readers/BamReader.cs +++ b/Readers/BamReader.cs @@ -40,7 +40,7 @@ public BamReader(string bam) ConsumeHeader(); } - public bool ReadSequence(out Sequence sequence) + public bool ReadSequence(out Sequence? sequence) { try { @@ -106,9 +106,10 @@ private BamAlignment ReadSequence() var bamAlignment = new BamAlignment { block_size = block_size, + refID = BitConverter.ToInt32(block, offset) }; - bamAlignment.refID = BitConverter.ToInt32(block, offset); offset += 4; + offset += 4; bamAlignment.pos = BitConverter.ToInt32(block, offset) + 1; offset += 4; bamAlignment.l_read_name = block[offset]; offset += 1; bamAlignment.mapq = block[offset]; offset += 1; diff --git a/Readers/FastqLineReader.cs b/Readers/FastqLineReader.cs index de7611b..07efeac 100644 --- a/Readers/FastqLineReader.cs +++ b/Readers/FastqLineReader.cs @@ -10,7 +10,7 @@ public class FastqLineReader : ISequenceReader { private readonly FileStream inputStream; - private readonly GZipStream gzipStream; + private readonly GZipStream? gzipStream; private readonly BufferedStream bufferedStream; @@ -50,7 +50,7 @@ public FastqLineReader(string fastq, bool gzipped = true) } } - public bool ReadSequence(out Sequence sequence) + public bool ReadSequence(out Sequence? sequence) { try { @@ -61,12 +61,12 @@ public bool ReadSequence(out Sequence sequence) return false; } - var identifier = Encoding.ASCII.GetBytes(streamReader.ReadLine()); - var read = Encoding.ASCII.GetBytes(streamReader.ReadLine()); - var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine()); - var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine()); + var identifier = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); + var read = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); + var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); + var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); - sequence = new Sequence(identifier, read, blank, quality); + sequence = new Sequence(0, identifier, read, blank, quality); sequencesRead++; return true; } diff --git a/Readers/FastqReader.cs b/Readers/FastqReader.cs index 946ee2b..bbdca57 100644 --- a/Readers/FastqReader.cs +++ b/Readers/FastqReader.cs @@ -9,7 +9,7 @@ public class FastqReader : ISequenceReader { private readonly FileStream inputStream; - private readonly GZipStream gzipStream; + private readonly GZipStream? gzipStream; private readonly BufferedStream bufferedStream; @@ -49,7 +49,7 @@ public FastqReader(string fastq, bool gzipped = true) } } - public bool ReadSequence(out Sequence sequence) + public bool ReadSequence(out Sequence? sequence) { // this is clearly dangerous, instead read a large chunk of the file // and then walk through it returning only the consumed portion while diff --git a/Readers/ReaderFactory.cs b/Readers/ReaderFactory.cs index c8c34bf..2b8a204 100644 --- a/Readers/ReaderFactory.cs +++ b/Readers/ReaderFactory.cs @@ -7,10 +7,19 @@ public static class ReaderFactory { public static ISequenceReader Create(CliOptions settings) { - return settings switch + return settings.Format switch { - { Fastq: true } => new FastqLineReader(settings.InputFilename, true), - { Bam: true } => new BamReader(settings.InputFilename), + ReaderType.Fastq => new FastqReader(settings.InputFilename, false), + ReaderType.FastqGz => new FastqReader(settings.InputFilename, true), + + ReaderType.FastqLine => new FastqLineReader(settings.InputFilename, false), + ReaderType.FastqLineGz => new FastqLineReader(settings.InputFilename, true), + + ReaderType.Sam => new SamReader(settings.InputFilename, false), + ReaderType.SamGz => new SamReader(settings.InputFilename, true), + + ReaderType.Bam => new BamReader(settings.InputFilename), + _ => throw new InvalidOperationException($"could not determine file type of {settings.InputFilename}") }; } diff --git a/Readers/ReaderType.cs b/Readers/ReaderType.cs new file mode 100644 index 0000000..19b746a --- /dev/null +++ b/Readers/ReaderType.cs @@ -0,0 +1,19 @@ +namespace Ovation.FasterQC.Net +{ + public enum ReaderType + { + Fastq, + + FastqGz, + + FastqLine, + + FastqLineGz, + + Sam, + + SamGz, + + Bam + } +} \ No newline at end of file diff --git a/Readers/SamReader.cs b/Readers/SamReader.cs new file mode 100644 index 0000000..ec20c03 --- /dev/null +++ b/Readers/SamReader.cs @@ -0,0 +1,136 @@ +using System; +using System.Globalization; +using System.IO; +using System.IO.Compression; +using System.Text; +using static Ovation.FasterQC.Net.Utils.CliOptions; + +namespace Ovation.FasterQC.Net +{ + public class SamReader : ISequenceReader + { + private readonly FileStream inputStream; + + private readonly GZipStream? gzipStream; + + private readonly BufferedStream bufferedStream; + + private readonly StreamReader streamReader; + + private bool disposedValue; + + private int sequencesRead = 0; + + public int SequencesRead => sequencesRead; + + public SamReader(string sam, bool gzipped = true) + { + var bufferSize = 128 * 1024; + + var fileStreamOptions = new FileStreamOptions() + { + Mode = FileMode.Open, + BufferSize = bufferSize, + }; + + if (gzipped == true) + { + inputStream = File.Open(sam, fileStreamOptions); + gzipStream = new GZipStream(inputStream, CompressionMode.Decompress); + bufferedStream = new BufferedStream(gzipStream, bufferSize); + streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); + } + else + { + inputStream = File.Open(sam, fileStreamOptions); + bufferedStream = new BufferedStream(inputStream, bufferSize); + streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); + } + + ConsumeHeader(); + } + + private void ConsumeHeader() + { + try + { + while (streamReader.Peek() == '@') + { + var header = streamReader.ReadLine(); + On(Settings.Debug, () => Console.Error.WriteLine(header)); + } + } + catch (EndOfStreamException) + { + // swallow this, we've run out of file and a call + // into ReadSequence will handle the EOF case + } + } + + public bool ReadSequence(out Sequence? sequence) + { + try + { + if (streamReader.EndOfStream == true) + { + goto endofstream; + } + + var entry = streamReader.ReadLine(); + if (entry == null) + { + goto endofstream; + } + + // this is clearly a bad approach, we're going to be allocating a + // ton of small strings here, probably better to read the line, + // find the tabs ourselves, then pull the bytes out of the components + var parts = entry.Split('\t', StringSplitOptions.TrimEntries); + + var identifier = Encoding.ASCII.GetBytes(parts[0]); + var flag = ushort.Parse(parts[1], CultureInfo.InvariantCulture); + var read = Encoding.ASCII.GetBytes(parts[9]); + var blank = Encoding.ASCII.GetBytes(""); + var quality = Encoding.ASCII.GetBytes(parts[10]); + + sequence = new Sequence(flag, identifier, read, blank, quality); + sequencesRead++; + return true; + } + catch (EndOfStreamException) + { + goto endofstream; + } + + endofstream: + On(Settings.Verbose, () => Console.Error.WriteLine("End of stream")); + sequence = null; + return false; + } + + public double ApproximateCompletion => + 100.0 * inputStream.Position / inputStream.Length; + + protected virtual void Dispose(bool disposing) + { + if (!disposedValue) + { + if (disposing) + { + streamReader?.Dispose(); + bufferedStream?.Dispose(); + gzipStream?.Dispose(); + inputStream?.Dispose(); + } + + disposedValue = true; + } + } + + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + } +} diff --git a/Utils/CliOptions.cs b/Utils/CliOptions.cs index 3c6422b..a5639c2 100644 --- a/Utils/CliOptions.cs +++ b/Utils/CliOptions.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Globalization; using CommandLine; namespace Ovation.FasterQC.Net.Utils @@ -10,32 +9,25 @@ public class CliOptions [Option('v', "verbose", Required = false, SetName = "Verbose", HelpText = "Set output to verbose messages.")] public bool Verbose { get; set; } - [Option("debug", Required = false, SetName = "Verbose", HelpText = "Show diagnostic output. Can only use with --verbose.")] + [Option('d', "debug", Required = false, SetName = "Verbose", HelpText = "Show diagnostic output. Can only use with --verbose.")] public bool Debug { get; set; } [Option('p', "progress", Required = false, SetName = "Progress", HelpText = "Show progress bar. Cannnot use with --verbose.")] public bool ShowProgress { get; set; } [Option('i', "input", Required = true, HelpText = "Input filename.")] - public string InputFilename { get; set; } + public string InputFilename { get; set; } = null!; [Option('o', "output", Required = false, HelpText = "Output filename. Defaults to STDOUT.")] - public string OutputFilename { get; set; } + public string OutputFilename { get; set; } = null!; - [Option('b', "bam", Required = false, HelpText = "Assume BAM format.")] - public bool Bam { get; set; } - - [Option('f', "fastq", Required = false, HelpText = "Assume FASTQ format.")] - public bool Fastq { get; set; } - - [Option('z', "zipped", Required = false, HelpText = "Assume input file is gzipped.")] - public bool Zipped { get; set; } + [Option('f', "format", Required = true, HelpText = "Type of input file.")] + public ReaderType Format { get; set; } [Option('m', "modules", Required = false, HelpText = "Space-separated list of modules to run, or 'all'.")] public IEnumerable ModuleNames { get; set; } - public static CliOptions Settings - { get; set; } + public static CliOptions Settings { get; set; } = null!; public const int UpdatePeriod = 100_000; @@ -46,27 +38,5 @@ public static void On(bool condition, Action action) action(); } } - - public bool Validate() - { - if (!(Bam || Fastq)) - { - Fastq = InputFilename.EndsWith(".fastq", ignoreCase: true, culture: CultureInfo.InvariantCulture) - || InputFilename.EndsWith(".fastq.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture); - - Bam = InputFilename.EndsWith(".bam", ignoreCase: true, culture: CultureInfo.InvariantCulture) - || InputFilename.EndsWith(".bam.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture); - } - - if (Zipped == false && !string.IsNullOrWhiteSpace(InputFilename)) - { - if (InputFilename.EndsWith(".gz", ignoreCase: true, culture: CultureInfo.InvariantCulture)) - { - Zipped = true; - } - } - - return Fastq || Bam; - } } }