From 19bbc09b9210c4790d92773289b949b823f27c28 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 09:48:55 -0700 Subject: [PATCH 1/2] issue-8: adding samreader --- .vscode/launch.json | 15 +++- Models/ReadFlag.cs | 57 ++++++++++++++++ Models/Sequence.cs | 7 +- Program.cs | 20 ++++-- Readers/FastqLineReader.cs | 2 +- Readers/ReaderFactory.cs | 15 +++- Readers/ReaderType.cs | 19 ++++++ Readers/SamReader.cs | 136 +++++++++++++++++++++++++++++++++++++ Utils/CliOptions.cs | 35 +--------- 9 files changed, 261 insertions(+), 45 deletions(-) create mode 100644 Models/ReadFlag.cs create mode 100644 Readers/ReaderType.cs create mode 100644 Readers/SamReader.cs diff --git a/.vscode/launch.json b/.vscode/launch.json index d62fcf7..5315e07 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,12 +12,23 @@ // If you have changed target frameworks, make sure to update the program path. "program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll", "args": [ - "-p", "-i", "/tmp/zr6254_1/zr6254_1.sorted.bam", "-o", "/tmp/bob.json", "-m", "BasicStatistics", "NCountsAtPosition" + "-v", + "-d", + "-f", + "sam", + "-i", + "./tmp/in3257_2_S1.sorted.sam", + "-o", + "./tmp/bob.json", + "-m", + "BasicStatistics", + "NCountsAtPosition" ], "cwd": "${workspaceFolder}", // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console "console": "internalConsole", - "stopAtEntry": false + "stopAtEntry": false, + "requireExactSource": false }, { "name": ".NET Core Attach", diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs new file mode 100644 index 0000000..0848cbb --- /dev/null +++ b/Models/ReadFlag.cs @@ -0,0 +1,57 @@ +using System; + +namespace Ovation.FasterQC.Net +{ + [Flags] + public enum ReadFlag : ushort + { + /// + /// template having multiple segments in sequencing (read is paired) + /// + Paired = 1, + + /// + /// + Aligned = 2, + + /// + /// + SegmentUnmapped = 4, + + /// + /// + NextSegmentUnmapped = 
8, + + /// + /// + ReverseComplemented = 16, + + /// + /// + NextSegmentReverseComplemented = 32, + + /// + /// + FirstSegment = 64, + + /// + /// + LastSegment = 128, + + /// + /// + NotPrimaryAlignment = 256, + + /// + /// + FailedQualityChecks = 512, + + /// + /// + OpticalDuplicate = 1024, + + /// + /// + SupplementaryAlignment = 2048 + } +} \ No newline at end of file diff --git a/Models/Sequence.cs b/Models/Sequence.cs index 3c63b88..b4bd196 100644 --- a/Models/Sequence.cs +++ b/Models/Sequence.cs @@ -5,6 +5,8 @@ namespace Ovation.FasterQC.Net { public class Sequence { + public ReadFlag ReadFlag { get; } + public byte[] Identifier { get; } public byte[] Read { get; } @@ -21,8 +23,9 @@ public Sequence(byte[] lines, int[] endOfLines) Quality = new ReadOnlyMemory(lines, endOfLines[2], endOfLines[3] - endOfLines[2]).ToArray(); } - public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality) + public Sequence(ushort readFlag, byte[] identifer, byte[] read, byte[] blank, byte[] quality) { + ReadFlag = (ReadFlag)readFlag; Identifier = new ReadOnlyMemory(identifer).ToArray(); Read = new ReadOnlyMemory(read).ToArray(); Blank = new ReadOnlyMemory(blank).ToArray(); @@ -31,6 +34,7 @@ public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality) public Sequence(BamAlignment bamAlignment) { + ReadFlag = (ReadFlag)bamAlignment.flag; Identifier = new ReadOnlyMemory(bamAlignment.read_name).ToArray(); Read = new ReadOnlyMemory(bamAlignment.seq).ToArray(); Quality = new ReadOnlyMemory(bamAlignment.qual).ToArray(); @@ -41,6 +45,7 @@ public override string ToString() { var sb = new StringBuilder("sequence: \n"); + sb.AppendLine(ReadFlag.ToString()); sb.AppendLine(new string(Encoding.ASCII.GetChars(Identifier))); sb.AppendLine(new string(Encoding.ASCII.GetChars(Read))); sb.AppendLine(new string(Encoding.ASCII.GetChars(Blank))); diff --git a/Program.cs b/Program.cs index fb15b01..a502ffe 100644 --- a/Program.cs +++ b/Program.cs @@ -1,6 +1,7 @@ using 
System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text.Json; using CommandLine; using Ovation.FasterQC.Net.Modules; @@ -23,13 +24,20 @@ class Program static void Main(string[] args) { - Parser.Default.ParseArguments(args) - .WithParsed(o => + var parser = new Parser(config => { - o.Validate(); - Settings = o; - new Program().Run(); - }); + config.AutoHelp = true; + config.AutoVersion = true; + config.CaseInsensitiveEnumValues = true; + } + ); + + parser.ParseArguments(args) + .WithParsed(o => + { + Settings = o; + new Program().Run(); + }); } private void Run() diff --git a/Readers/FastqLineReader.cs b/Readers/FastqLineReader.cs index 9c2c264..07efeac 100644 --- a/Readers/FastqLineReader.cs +++ b/Readers/FastqLineReader.cs @@ -66,7 +66,7 @@ public bool ReadSequence(out Sequence? sequence) var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); - sequence = new Sequence(identifier, read, blank, quality); + sequence = new Sequence(0, identifier, read, blank, quality); sequencesRead++; return true; } diff --git a/Readers/ReaderFactory.cs b/Readers/ReaderFactory.cs index 8b63848..2b8a204 100644 --- a/Readers/ReaderFactory.cs +++ b/Readers/ReaderFactory.cs @@ -7,10 +7,19 @@ public static class ReaderFactory { public static ISequenceReader Create(CliOptions settings) { - return settings switch + return settings.Format switch { - { Fastq: true } => new FastqLineReader(settings.InputFilename, settings.Zipped), - { Bam: true } => new BamReader(settings.InputFilename), + ReaderType.Fastq => new FastqReader(settings.InputFilename, false), + ReaderType.FastqGz => new FastqReader(settings.InputFilename, true), + + ReaderType.FastqLine => new FastqLineReader(settings.InputFilename, false), + ReaderType.FastqLineGz => new FastqLineReader(settings.InputFilename, true), + + ReaderType.Sam => new SamReader(settings.InputFilename, false), + ReaderType.SamGz => 
new SamReader(settings.InputFilename, true), + + ReaderType.Bam => new BamReader(settings.InputFilename), + _ => throw new InvalidOperationException($"could not determine file type of {settings.InputFilename}") }; } diff --git a/Readers/ReaderType.cs b/Readers/ReaderType.cs new file mode 100644 index 0000000..19b746a --- /dev/null +++ b/Readers/ReaderType.cs @@ -0,0 +1,19 @@ +namespace Ovation.FasterQC.Net +{ + public enum ReaderType + { + Fastq, + + FastqGz, + + FastqLine, + + FastqLineGz, + + Sam, + + SamGz, + + Bam + } +} \ No newline at end of file diff --git a/Readers/SamReader.cs b/Readers/SamReader.cs new file mode 100644 index 0000000..ec20c03 --- /dev/null +++ b/Readers/SamReader.cs @@ -0,0 +1,136 @@ +using System; +using System.Globalization; +using System.IO; +using System.IO.Compression; +using System.Text; +using static Ovation.FasterQC.Net.Utils.CliOptions; + +namespace Ovation.FasterQC.Net +{ + public class SamReader : ISequenceReader + { + private readonly FileStream inputStream; + + private readonly GZipStream? 
gzipStream; + + private readonly BufferedStream bufferedStream; + + private readonly StreamReader streamReader; + + private bool disposedValue; + + private int sequencesRead = 0; + + public int SequencesRead => sequencesRead; + + public SamReader(string sam, bool gzipped = true) + { + var bufferSize = 128 * 1024; + + var fileStreamOptions = new FileStreamOptions() + { + Mode = FileMode.Open, + BufferSize = bufferSize, + }; + + if (gzipped == true) + { + inputStream = File.Open(sam, fileStreamOptions); + gzipStream = new GZipStream(inputStream, CompressionMode.Decompress); + bufferedStream = new BufferedStream(gzipStream, bufferSize); + streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); + } + else + { + inputStream = File.Open(sam, fileStreamOptions); + bufferedStream = new BufferedStream(inputStream, bufferSize); + streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); + } + + ConsumeHeader(); + } + + private void ConsumeHeader() + { + try + { + while (streamReader.Peek() == '@') + { + var header = streamReader.ReadLine(); + On(Settings.Debug, () => Console.Error.WriteLine(header)); + } + } + catch (EndOfStreamException) + { + // swallow this, we've run out of file and a call + // into ReadSequence will handle the EOF case + } + } + + public bool ReadSequence(out Sequence? 
sequence) + { + try + { + if (streamReader.EndOfStream == true) + { + goto endofstream; + } + + var entry = streamReader.ReadLine(); + if (entry == null) + { + goto endofstream; + } + + // this is clearly a bad approach, we're going to be allocating a + // ton of small strings here, probably better to read the line, + // find the tabs ourselves, then pull the bytes out of the components + var parts = entry.Split('\t', StringSplitOptions.TrimEntries); + + var identifier = Encoding.ASCII.GetBytes(parts[0]); + var flag = ushort.Parse(parts[1], CultureInfo.InvariantCulture); + var read = Encoding.ASCII.GetBytes(parts[9]); + var blank = Encoding.ASCII.GetBytes(""); + var quality = Encoding.ASCII.GetBytes(parts[10]); + + sequence = new Sequence(flag, identifier, read, blank, quality); + sequencesRead++; + return true; + } + catch (EndOfStreamException) + { + goto endofstream; + } + + endofstream: + On(Settings.Verbose, () => Console.Error.WriteLine("End of stream")); + sequence = null; + return false; + } + + public double ApproximateCompletion => + 100.0 * inputStream.Position / inputStream.Length; + + protected virtual void Dispose(bool disposing) + { + if (!disposedValue) + { + if (disposing) + { + streamReader?.Dispose(); + bufferedStream?.Dispose(); + gzipStream?.Dispose(); + inputStream?.Dispose(); + } + + disposedValue = true; + } + } + + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + } +} diff --git a/Utils/CliOptions.cs b/Utils/CliOptions.cs index a9e4b76..a4b4a29 100644 --- a/Utils/CliOptions.cs +++ b/Utils/CliOptions.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Globalization; using CommandLine; namespace Ovation.FasterQC.Net.Utils @@ -10,7 +9,7 @@ public class CliOptions [Option('v', "verbose", Required = false, SetName = "Verbose", HelpText = "Set output to verbose messages.")] public bool Verbose { get; set; } - [Option("debug", Required = false, SetName = "Verbose", HelpText = "Show 
diagnostic output. Can only use with --verbose.")] + [Option('d', "debug", Required = false, SetName = "Verbose", HelpText = "Show diagnostic output. Can only use with --verbose.")] public bool Debug { get; set; } [Option('p', "progress", Required = false, SetName = "Progress", HelpText = "Show progress bar. Cannnot use with --verbose.")] @@ -22,14 +21,8 @@ public class CliOptions [Option('o', "output", Required = false, HelpText = "Output filename. Defaults to STDOUT.")] public string OutputFilename { get; set; } = null!; - [Option('b', "bam", Required = false, HelpText = "Assume BAM format.")] - public bool Bam { get; set; } - - [Option('f', "fastq", Required = false, HelpText = "Assume FASTQ format.")] - public bool Fastq { get; set; } - - [Option('z', "zipped", Required = false, HelpText = "Assume input file is gzipped.")] - public bool Zipped { get; set; } + [Option('f', "format", Required = true, HelpText = "Type of input file.")] + public ReaderType Format { get; set; } [Option('m', "modules", Required = true, Min = 1, HelpText = "Space-separated list of modules to run, or 'all'.")] public IEnumerable ModuleNames { get; set; } = Array.Empty(); @@ -45,27 +38,5 @@ public static void On(bool condition, Action action) action(); } } - - public bool Validate() - { - if (!(Bam || Fastq)) - { - Fastq = InputFilename.EndsWith(".fastq", ignoreCase: true, culture: CultureInfo.InvariantCulture) - || InputFilename.EndsWith(".fastq.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture); - - Bam = InputFilename.EndsWith(".bam", ignoreCase: true, culture: CultureInfo.InvariantCulture) - || InputFilename.EndsWith(".bam.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture); - } - - if (Zipped == false && !string.IsNullOrWhiteSpace(InputFilename)) - { - if (InputFilename.EndsWith(".gz", ignoreCase: true, culture: CultureInfo.InvariantCulture)) - { - Zipped = true; - } - } - - return Fastq || Bam; - } } } From a8df81966bf1379e7549c4d325d75d473168445c Mon Sep 17 
00:00:00 2001 From: Michaeljon Miller Date: Wed, 18 May 2022 09:54:38 -0700 Subject: [PATCH 2/2] issue-8: added missing docs --- Models/ReadFlag.cs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs index 0848cbb..5af2fae 100644 --- a/Models/ReadFlag.cs +++ b/Models/ReadFlag.cs @@ -11,46 +11,57 @@ public enum ReadFlag : ushort Paired = 1, /// + /// each segment properly aligned according to the aligner (read mapped in proper pair) /// Aligned = 2, /// + /// segment unmapped (read1 unmapped) /// SegmentUnmapped = 4, /// + /// next segment in the template unmapped (read2 unmapped) /// NextSegmentUnmapped = 8, /// + /// SEQ being reverse complemented (read1 reverse complemented) /// ReverseComplemented = 16, /// + /// SEQ of the next segment in the template being reverse complemented (read2 reverse complemented) /// NextSegmentReverseComplemented = 32, /// + /// the first segment in the template (is read1) /// FirstSegment = 64, /// + /// the last segment in the template (is read2) /// LastSegment = 128, /// + /// not primary alignment /// NotPrimaryAlignment = 256, /// + /// alignment fails quality checks /// FailedQualityChecks = 512, /// + /// PCR or optical duplicate /// OpticalDuplicate = 1024, /// + /// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region) /// SupplementaryAlignment = 2048 }