From 1b9499986ccb9e54611c3ba6788a6c5a15414996 Mon Sep 17 00:00:00 2001 From: Michaeljon Miller Date: Thu, 26 May 2022 18:45:20 -0700 Subject: [PATCH] issue-3: rationalize fastq readers --- Readers/AbstractReader.cs | 70 +++++++++++++++++++++++++ Readers/BamReader.cs | 43 ++-------------- Readers/FastqLineReader.cs | 103 ------------------------------------- Readers/FastqReader.cs | 92 +++++++-------------------------- Readers/ReaderFactory.cs | 3 -- Readers/ReaderType.cs | 4 -- Readers/SamReader.cs | 84 ++++++++---------------------- 7 files changed, 114 insertions(+), 285 deletions(-) create mode 100644 Readers/AbstractReader.cs delete mode 100644 Readers/FastqLineReader.cs diff --git a/Readers/AbstractReader.cs b/Readers/AbstractReader.cs new file mode 100644 index 0000000..5b43a03 --- /dev/null +++ b/Readers/AbstractReader.cs @@ -0,0 +1,70 @@ +using System; +using System.IO; +using System.IO.Compression; + +namespace Ovation.FasterQC.Net +{ + public abstract class AbstractReader : ISequenceReader + { + private readonly FileStream inputStream; + + private readonly GZipStream? gzipStream; + + protected readonly BufferedStream bufferedStream; + + protected readonly int bufferSize = 128 * 1024; + + private bool disposedValue; + + protected ulong sequencesRead = 0; + + public ulong SequencesRead => sequencesRead; + + public double ApproximateCompletion => + 100.0 * inputStream.Position / inputStream.Length; + + public AbstractReader(string filename, bool gzipped = true) + { + var fileStreamOptions = new FileStreamOptions() + { + Mode = FileMode.Open, + BufferSize = bufferSize, + }; + + if (gzipped == true) + { + inputStream = File.Open(filename, fileStreamOptions); + gzipStream = new GZipStream(inputStream, CompressionMode.Decompress); + bufferedStream = new BufferedStream(gzipStream, bufferSize); + } + else + { + inputStream = File.Open(filename, fileStreamOptions); + bufferedStream = new BufferedStream(inputStream, bufferSize); + } + } + + public abstract bool ReadSequence(out Sequence? sequence); + + protected virtual void Dispose(bool disposing) + { + if (!disposedValue) + { + if (disposing) + { + bufferedStream?.Dispose(); + gzipStream?.Dispose(); + inputStream?.Dispose(); + } + + disposedValue = true; + } + } + + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + } +} diff --git a/Readers/BamReader.cs b/Readers/BamReader.cs index 6463cea..e7dfb78 100644 --- a/Readers/BamReader.cs +++ b/Readers/BamReader.cs @@ -1,46 +1,25 @@ using System; using System.IO; -using System.IO.Compression; using System.Text; using static Ovation.FasterQC.Net.Utils.CliOptions; namespace Ovation.FasterQC.Net { - public class BamReader : ISequenceReader + public class BamReader : AbstractReader { - private readonly FileStream inputStream; - - private readonly GZipStream gzipStream; - - private readonly BufferedStream bufferedStream; - private readonly BinaryReader binaryReader; private bool disposedValue; - private ulong sequencesRead = 0; - - public ulong SequencesRead => sequencesRead; - - public BamReader(string bam) + public BamReader(string bam) : + base(bam, false) { - var bufferSize = 128 * 1024; - - var fileStreamOptions = new FileStreamOptions() - { - Mode = FileMode.Open, - BufferSize = bufferSize, - }; - - inputStream = File.Open(bam, fileStreamOptions); - gzipStream = new GZipStream(inputStream, CompressionMode.Decompress); - bufferedStream = new BufferedStream(gzipStream, bufferSize); binaryReader = new BinaryReader(bufferedStream); ConsumeHeader(); } - public bool ReadSequence(out Sequence? sequence) + public override bool ReadSequence(out Sequence? sequence) { try { @@ -57,9 +36,6 @@ public bool ReadSequence(out Sequence? sequence) } } - public double ApproximateCompletion => - 100.0 * inputStream.Position / inputStream.Length; - private void ConsumeHeader() { var magic = binaryReader.ReadBytes(4); @@ -174,26 +150,17 @@ private BamAlignment ReadSequence() return bamAlignment; } - protected virtual void Dispose(bool disposing) + protected override void Dispose(bool disposing) { if (!disposedValue) { if (disposing) { binaryReader?.Dispose(); - bufferedStream?.Dispose(); - gzipStream?.Dispose(); - inputStream?.Dispose(); } disposedValue = true; } } - - public void Dispose() - { - Dispose(true); - GC.SuppressFinalize(this); - } } } diff --git a/Readers/FastqLineReader.cs b/Readers/FastqLineReader.cs deleted file mode 100644 index 1943646..0000000 --- a/Readers/FastqLineReader.cs +++ /dev/null @@ -1,103 +0,0 @@ -using System; -using System.IO; -using System.IO.Compression; -using System.Text; -using static Ovation.FasterQC.Net.Utils.CliOptions; - -namespace Ovation.FasterQC.Net -{ - public class FastqLineReader : ISequenceReader - { - private readonly FileStream inputStream; - - private readonly GZipStream? gzipStream; - - private readonly BufferedStream bufferedStream; - - private readonly StreamReader streamReader; - - private bool disposedValue; - - private ulong sequencesRead = 0; - - public ulong SequencesRead => sequencesRead; - - public double ApproximateCompletion => - 100.0 * inputStream.Position / inputStream.Length; - - public FastqLineReader(string fastq, bool gzipped = true) - { - var bufferSize = 128 * 1024; - - var fileStreamOptions = new FileStreamOptions() - { - Mode = FileMode.Open, - BufferSize = bufferSize, - }; - - if (gzipped == true) - { - inputStream = File.Open(fastq, fileStreamOptions); - gzipStream = new GZipStream(inputStream, CompressionMode.Decompress); - bufferedStream = new BufferedStream(gzipStream, bufferSize); - streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); - } - else - { - inputStream = File.Open(fastq, fileStreamOptions); - bufferedStream = new BufferedStream(inputStream, bufferSize); - streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); - } - } - - public bool ReadSequence(out Sequence? sequence) - { - try - { - if (streamReader.EndOfStream == true) - { - On(Settings.Verbose, () => Console.Error.WriteLine("End of stream")); - sequence = null; - return false; - } - - var identifier = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); - var read = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); - var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); - var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); - - sequence = new Sequence(0, identifier, read, blank, quality); - sequencesRead++; - return true; - } - catch (EndOfStreamException) - { - On(Settings.Verbose, () => Console.Error.WriteLine("End of stream")); - sequence = null; - return false; - } - } - - protected virtual void Dispose(bool disposing) - { - if (!disposedValue) - { - if (disposing) - { - streamReader?.Dispose(); - bufferedStream?.Dispose(); - gzipStream?.Dispose(); - inputStream?.Dispose(); - } - - disposedValue = true; - } - } - - public void Dispose() - { - Dispose(true); - GC.SuppressFinalize(this); - } - } -} diff --git a/Readers/FastqReader.cs b/Readers/FastqReader.cs index 28bf629..437d2cb 100644 --- a/Readers/FastqReader.cs +++ b/Readers/FastqReader.cs @@ -1,86 +1,39 @@ using System; using System.IO; -using System.IO.Compression; +using System.Text; using static Ovation.FasterQC.Net.Utils.CliOptions; namespace Ovation.FasterQC.Net { - public class FastqReader : ISequenceReader + public class FastqReader : AbstractReader { - private readonly FileStream inputStream; - - private readonly GZipStream? gzipStream; - - private readonly BufferedStream bufferedStream; - - private readonly BinaryReader binaryReader; + protected readonly StreamReader streamReader; private bool disposedValue; - private ulong sequencesRead = 0; - - public ulong SequencesRead => sequencesRead; - - public double ApproximateCompletion => - 100.0 * inputStream.Position / inputStream.Length; - - public FastqReader(string fastq, bool gzipped = true) + public FastqReader(string fastq, bool gzipped) + : base(fastq, gzipped) { - var bufferSize = 128 * 1024; - - var fileStreamOptions = new FileStreamOptions() - { - Mode = FileMode.Open, - BufferSize = bufferSize, - }; - - if (gzipped == true) - { - inputStream = File.Open(fastq, fileStreamOptions); - gzipStream = new GZipStream(inputStream, CompressionMode.Decompress); - bufferedStream = new BufferedStream(gzipStream, bufferSize); - binaryReader = new BinaryReader(bufferedStream); - } - else - { - inputStream = File.Open(fastq, fileStreamOptions); - bufferedStream = new BufferedStream(inputStream, bufferSize); - binaryReader = new BinaryReader(bufferedStream); - } + streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); } - public bool ReadSequence(out Sequence? sequence) + public override bool ReadSequence(out Sequence? sequence) { - // this is clearly dangerous, instead read a large chunk of the file - // and then walk through it returning only the consumed portion while - // keeping track of the last byte consumed on the stream - byte[] bytes = new byte[1024]; - - int offset = 0; - int line = 0; - int[] endOfLines = new int[4]; - try { - while (line < 4) + if (streamReader.EndOfStream == true) { - var b = binaryReader.ReadByte(); - if (b == (byte)'\n') - { - endOfLines[line++] = offset; - } - else - { - bytes[offset++] = b; - } + On(Settings.Verbose, () => Console.Error.WriteLine("End of stream")); + sequence = null; + return false; } - for (var read = endOfLines[1]; read < endOfLines[2]; read++) - { - bytes[read] &= 0xdf; - } + var identifier = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); + var read = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); + var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); + var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? ""); - sequence = new Sequence(bytes, endOfLines); + sequence = new Sequence(0, identifier, read, blank, quality); sequencesRead++; return true; } @@ -92,26 +45,17 @@ public bool ReadSequence(out Sequence? sequence) } } - protected virtual void Dispose(bool disposing) + protected override void Dispose(bool disposing) { if (!disposedValue) { if (disposing) { - binaryReader?.Dispose(); - bufferedStream?.Dispose(); - gzipStream?.Dispose(); - inputStream?.Dispose(); + streamReader?.Dispose(); } disposedValue = true; } } - - public void Dispose() - { - Dispose(true); - GC.SuppressFinalize(this); - } } } diff --git a/Readers/ReaderFactory.cs b/Readers/ReaderFactory.cs index 2b8a204..0eb36b1 100644 --- a/Readers/ReaderFactory.cs +++ b/Readers/ReaderFactory.cs @@ -12,9 +12,6 @@ public static ISequenceReader Create(CliOptions settings) ReaderType.Fastq => new FastqReader(settings.InputFilename, false), ReaderType.FastqGz => new FastqReader(settings.InputFilename, true), - ReaderType.FastqLine => new FastqLineReader(settings.InputFilename, false), - ReaderType.FastqLineGz => new FastqLineReader(settings.InputFilename, true), - ReaderType.Sam => new SamReader(settings.InputFilename, false), ReaderType.SamGz => new SamReader(settings.InputFilename, true), diff --git a/Readers/ReaderType.cs b/Readers/ReaderType.cs index 19b746a..7638533 100644 --- a/Readers/ReaderType.cs +++ b/Readers/ReaderType.cs @@ -6,10 +6,6 @@ public enum ReaderType FastqGz, - FastqLine, - - FastqLineGz, - Sam, SamGz, diff --git a/Readers/SamReader.cs b/Readers/SamReader.cs index 9e1584f..5357fbc 100644 --- a/Readers/SamReader.cs +++ b/Readers/SamReader.cs @@ -1,73 +1,26 @@ using System; using System.Globalization; using System.IO; -using System.IO.Compression; using System.Text; using static Ovation.FasterQC.Net.Utils.CliOptions; namespace Ovation.FasterQC.Net { - public class SamReader : ISequenceReader + public class SamReader : AbstractReader { - private readonly FileStream inputStream; - - private readonly GZipStream? gzipStream; - - private readonly BufferedStream bufferedStream; - private readonly StreamReader streamReader; private bool disposedValue; - private ulong sequencesRead = 0; - - public ulong SequencesRead => sequencesRead; - public SamReader(string sam, bool gzipped = true) + : base(sam, gzipped) { - var bufferSize = 128 * 1024; - - var fileStreamOptions = new FileStreamOptions() - { - Mode = FileMode.Open, - BufferSize = bufferSize, - }; - - if (gzipped == true) - { - inputStream = File.Open(sam, fileStreamOptions); - gzipStream = new GZipStream(inputStream, CompressionMode.Decompress); - bufferedStream = new BufferedStream(gzipStream, bufferSize); - streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); - } - else - { - inputStream = File.Open(sam, fileStreamOptions); - bufferedStream = new BufferedStream(inputStream, bufferSize); - streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); - } + streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize); ConsumeHeader(); } - private void ConsumeHeader() - { - try - { - while (streamReader.Peek() == '@') - { - var header = streamReader.ReadLine(); - On(Settings.Debug, () => Console.Error.WriteLine(header)); - } - } - catch (EndOfStreamException) - { - // swallow this, we've run out of file and a call - // into ReadSequence will handle the EOF case - } - } - - public bool ReadSequence(out Sequence? sequence) + public override bool ReadSequence(out Sequence? sequence) { try { @@ -108,29 +61,34 @@ public bool ReadSequence(out Sequence? sequence) return false; } - public double ApproximateCompletion => - 100.0 * inputStream.Position / inputStream.Length; + private void ConsumeHeader() + { + try + { + while (streamReader.Peek() == '@') + { + var header = streamReader.ReadLine(); + On(Settings.Debug, () => Console.Error.WriteLine(header)); + } + } + catch (EndOfStreamException) + { + // swallow this, we've run out of file and a call + // into ReadSequence will handle the EOF case + } + } - protected virtual void Dispose(bool disposing) + protected override void Dispose(bool disposing) { if (!disposedValue) { if (disposing) { streamReader?.Dispose(); - bufferedStream?.Dispose(); - gzipStream?.Dispose(); - inputStream?.Dispose(); } disposedValue = true; } } - - public void Dispose() - { - Dispose(true); - GC.SuppressFinalize(this); - } } }