Skip to content

Commit

Permalink
issue-3: rationalize fastq readers
Browse files Browse the repository at this point in the history
  • Loading branch information
michaeljon committed May 27, 2022
1 parent 7274b61 commit 1b94999
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 285 deletions.
70 changes: 70 additions & 0 deletions Readers/AbstractReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
using System;
using System.IO;
using System.IO.Compression;

namespace Ovation.FasterQC.Net
{
/// <summary>
/// Base class for file-backed sequence readers. Opens the input file,
/// optionally wraps it in a gzip decompression stream, and exposes a
/// buffered stream for derived readers to parse.
/// </summary>
public abstract class AbstractReader : ISequenceReader
{
    // Raw file stream; also used to estimate progress through the file.
    private readonly FileStream inputStream;

    // Only created when the reader is constructed with gzipped == true.
    private readonly GZipStream? gzipStream;

    /// <summary>
    /// Buffered view over the (possibly decompressed) input that derived
    /// readers consume.
    /// </summary>
    protected readonly BufferedStream bufferedStream;

    /// <summary>
    /// Buffer size, in bytes, used for both the file stream and the
    /// buffered stream.
    /// </summary>
    protected readonly int bufferSize = 128 * 1024;

    private bool disposedValue;

    /// <summary>
    /// Running count of sequences returned so far; derived readers
    /// increment this as they produce sequences.
    /// </summary>
    protected ulong sequencesRead = 0;

    /// <summary>Number of sequences read so far.</summary>
    public ulong SequencesRead => sequencesRead;

    /// <summary>
    /// Percentage (0-100) of the underlying file consumed so far. The
    /// position is taken from the raw file stream, so for gzip input it
    /// tracks compressed bytes. An empty file is reported as 100% rather
    /// than producing NaN from a 0/0 division.
    /// </summary>
    public double ApproximateCompletion =>
        inputStream.Length == 0
            ? 100.0
            : 100.0 * inputStream.Position / inputStream.Length;

    /// <summary>
    /// Opens <paramref name="filename"/> for reading and builds the stream
    /// chain used by derived readers.
    /// </summary>
    /// <param name="filename">Path of the file to open.</param>
    /// <param name="gzipped">
    /// When <c>true</c>, the file is decompressed through a
    /// <see cref="GZipStream"/> before buffering.
    /// </param>
    protected AbstractReader(string filename, bool gzipped = true)
    {
        var fileStreamOptions = new FileStreamOptions()
        {
            Mode = FileMode.Open,
            BufferSize = bufferSize,
        };

        inputStream = File.Open(filename, fileStreamOptions);

        try
        {
            if (gzipped)
            {
                gzipStream = new GZipStream(inputStream, CompressionMode.Decompress);
                bufferedStream = new BufferedStream(gzipStream, bufferSize);
            }
            else
            {
                bufferedStream = new BufferedStream(inputStream, bufferSize);
            }
        }
        catch
        {
            // The caller never receives an instance to dispose when the
            // constructor throws, so release the file handle here.
            gzipStream?.Dispose();
            inputStream.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Reads the next sequence from the input.
    /// </summary>
    /// <param name="sequence">Receives the sequence that was read, if any.</param>
    /// <returns>Implementation-defined success flag; see the derived reader.</returns>
    public abstract bool ReadSequence(out Sequence? sequence);

    /// <summary>
    /// Releases the streams owned by this reader. Safe to call more than
    /// once; only the first call has an effect.
    /// </summary>
    /// <param name="disposing">
    /// <c>true</c> when called from <see cref="Dispose()"/>; managed
    /// streams are only disposed in that case.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        if (disposedValue)
        {
            return;
        }

        if (disposing)
        {
            // Dispose from the outermost wrapper inwards.
            bufferedStream?.Dispose();
            gzipStream?.Dispose();
            inputStream?.Dispose();
        }

        disposedValue = true;
    }

    /// <summary>Disposes the reader and suppresses finalization.</summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }
}
}
43 changes: 5 additions & 38 deletions Readers/BamReader.cs
Original file line number Diff line number Diff line change
@@ -1,46 +1,25 @@
using System;
using System.IO;
using System.IO.Compression;
using System.Text;
using static Ovation.FasterQC.Net.Utils.CliOptions;

namespace Ovation.FasterQC.Net
{
public class BamReader : ISequenceReader
public class BamReader : AbstractReader
{
private readonly FileStream inputStream;

private readonly GZipStream gzipStream;

private readonly BufferedStream bufferedStream;

private readonly BinaryReader binaryReader;

private bool disposedValue;

private ulong sequencesRead = 0;

public ulong SequencesRead => sequencesRead;

public BamReader(string bam)
public BamReader(string bam) :
base(bam, false)
{
var bufferSize = 128 * 1024;

var fileStreamOptions = new FileStreamOptions()
{
Mode = FileMode.Open,
BufferSize = bufferSize,
};

inputStream = File.Open(bam, fileStreamOptions);
gzipStream = new GZipStream(inputStream, CompressionMode.Decompress);
bufferedStream = new BufferedStream(gzipStream, bufferSize);
binaryReader = new BinaryReader(bufferedStream);

ConsumeHeader();
}

public bool ReadSequence(out Sequence? sequence)
public override bool ReadSequence(out Sequence? sequence)
{
try
{
Expand All @@ -57,9 +36,6 @@ public bool ReadSequence(out Sequence? sequence)
}
}

public double ApproximateCompletion =>
100.0 * inputStream.Position / inputStream.Length;

private void ConsumeHeader()
{
var magic = binaryReader.ReadBytes(4);
Expand Down Expand Up @@ -174,26 +150,17 @@ private BamAlignment ReadSequence()
return bamAlignment;
}

protected virtual void Dispose(bool disposing)
protected override void Dispose(bool disposing)
{
if (!disposedValue)
{
if (disposing)
{
binaryReader?.Dispose();
bufferedStream?.Dispose();
gzipStream?.Dispose();
inputStream?.Dispose();
}

disposedValue = true;
}
}

public void Dispose()
{
Dispose(true);
GC.SuppressFinalize(this);
}
}
}
103 changes: 0 additions & 103 deletions Readers/FastqLineReader.cs

This file was deleted.

92 changes: 18 additions & 74 deletions Readers/FastqReader.cs
Original file line number Diff line number Diff line change
@@ -1,86 +1,39 @@
using System;
using System.IO;
using System.IO.Compression;
using System.Text;
using static Ovation.FasterQC.Net.Utils.CliOptions;

namespace Ovation.FasterQC.Net
{
public class FastqReader : ISequenceReader
public class FastqReader : AbstractReader
{
private readonly FileStream inputStream;

private readonly GZipStream? gzipStream;

private readonly BufferedStream bufferedStream;

private readonly BinaryReader binaryReader;
protected readonly StreamReader streamReader;

private bool disposedValue;

private ulong sequencesRead = 0;

public ulong SequencesRead => sequencesRead;

public double ApproximateCompletion =>
100.0 * inputStream.Position / inputStream.Length;

public FastqReader(string fastq, bool gzipped = true)
public FastqReader(string fastq, bool gzipped)
: base(fastq, gzipped)
{
var bufferSize = 128 * 1024;

var fileStreamOptions = new FileStreamOptions()
{
Mode = FileMode.Open,
BufferSize = bufferSize,
};

if (gzipped == true)
{
inputStream = File.Open(fastq, fileStreamOptions);
gzipStream = new GZipStream(inputStream, CompressionMode.Decompress);
bufferedStream = new BufferedStream(gzipStream, bufferSize);
binaryReader = new BinaryReader(bufferedStream);
}
else
{
inputStream = File.Open(fastq, fileStreamOptions);
bufferedStream = new BufferedStream(inputStream, bufferSize);
binaryReader = new BinaryReader(bufferedStream);
}
streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize);
}

public bool ReadSequence(out Sequence? sequence)
public override bool ReadSequence(out Sequence? sequence)
{
// this is clearly dangerous, instead read a large chunk of the file
// and then walk through it returning only the consumed portion while
// keeping track of the last byte consumed on the stream
byte[] bytes = new byte[1024];

int offset = 0;
int line = 0;
int[] endOfLines = new int[4];

try
{
while (line < 4)
if (streamReader.EndOfStream == true)
{
var b = binaryReader.ReadByte();
if (b == (byte)'\n')
{
endOfLines[line++] = offset;
}
else
{
bytes[offset++] = b;
}
On(Settings.Verbose, () => Console.Error.WriteLine("End of stream"));
sequence = null;
return false;
}

for (var read = endOfLines[1]; read < endOfLines[2]; read++)
{
bytes[read] &= 0xdf;
}
var identifier = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
var read = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");

sequence = new Sequence(bytes, endOfLines);
sequence = new Sequence(0, identifier, read, blank, quality);
sequencesRead++;
return true;
}
Expand All @@ -92,26 +45,17 @@ public bool ReadSequence(out Sequence? sequence)
}
}

protected virtual void Dispose(bool disposing)
protected override void Dispose(bool disposing)
{
if (!disposedValue)
{
if (disposing)
{
binaryReader?.Dispose();
bufferedStream?.Dispose();
gzipStream?.Dispose();
inputStream?.Dispose();
streamReader?.Dispose();
}

disposedValue = true;
}
}

public void Dispose()
{
Dispose(true);
GC.SuppressFinalize(this);
}
}
}
Loading

0 comments on commit 1b94999

Please sign in to comment.