Skip to content

Commit

Permalink
issue-8: adding samreader
Browse files Browse the repository at this point in the history
  • Loading branch information
michaeljon committed May 18, 2022
1 parent 4e0b684 commit 19bbc09
Show file tree
Hide file tree
Showing 9 changed files with 261 additions and 45 deletions.
15 changes: 13 additions & 2 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,23 @@
// If you have changed target frameworks, make sure to update the program path.
"program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll",
"args": [
"-p", "-i", "/tmp/zr6254_1/zr6254_1.sorted.bam", "-o", "/tmp/bob.json", "-m", "BasicStatistics", "NCountsAtPosition"
"-v",
"-d",
"-f",
"sam",
"-i",
"./tmp/in3257_2_S1.sorted.sam",
"-o",
"./tmp/bob.json",
"-m",
"BasicStatistics",
"NCountsAtPosition"
],
"cwd": "${workspaceFolder}",
// For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console
"console": "internalConsole",
"stopAtEntry": false
"stopAtEntry": false,
"requireExactSource": false
},
{
"name": ".NET Core Attach",
Expand Down
57 changes: 57 additions & 0 deletions Models/ReadFlag.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
using System;

namespace Ovation.FasterQC.Net
{
    /// <summary>
    /// Bit flags attached to a read. The values line up with the FLAG column
    /// of the SAM/BAM format, so a raw <c>ushort</c> taken from a SAM record
    /// or a BAM alignment can be cast straight to this type.
    /// </summary>
    [Flags]
    public enum ReadFlag : ushort
    {
        /// <summary>
        /// 0x1: the template has multiple segments in sequencing (read is paired)
        /// </summary>
        Paired = 0x1,

        /// <summary>
        /// 0x2: each segment is properly aligned according to the aligner
        /// </summary>
        Aligned = 0x2,

        /// <summary>
        /// 0x4: this segment is unmapped
        /// </summary>
        SegmentUnmapped = 0x4,

        /// <summary>
        /// 0x8: the next segment in the template is unmapped
        /// </summary>
        NextSegmentUnmapped = 0x8,

        /// <summary>
        /// 0x10: the read sequence is stored reverse complemented
        /// </summary>
        ReverseComplemented = 0x10,

        /// <summary>
        /// 0x20: the next segment in the template is reverse complemented
        /// </summary>
        NextSegmentReverseComplemented = 0x20,

        /// <summary>
        /// 0x40: this is the first segment in the template
        /// </summary>
        FirstSegment = 0x40,

        /// <summary>
        /// 0x80: this is the last segment in the template
        /// </summary>
        LastSegment = 0x80,

        /// <summary>
        /// 0x100: this is not the primary alignment (secondary alignment)
        /// </summary>
        NotPrimaryAlignment = 0x100,

        /// <summary>
        /// 0x200: the read did not pass quality checks / filters
        /// </summary>
        FailedQualityChecks = 0x200,

        /// <summary>
        /// 0x400: the read is marked as a duplicate (PCR or optical)
        /// </summary>
        OpticalDuplicate = 0x400,

        /// <summary>
        /// 0x800: this is a supplementary alignment
        /// </summary>
        SupplementaryAlignment = 0x800
    }
}
7 changes: 6 additions & 1 deletion Models/Sequence.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ namespace Ovation.FasterQC.Net
{
public class Sequence
{
public ReadFlag ReadFlag { get; }

public byte[] Identifier { get; }

public byte[] Read { get; }
Expand All @@ -21,8 +23,9 @@ public Sequence(byte[] lines, int[] endOfLines)
Quality = new ReadOnlyMemory<byte>(lines, endOfLines[2], endOfLines[3] - endOfLines[2]).ToArray();
}

public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality)
public Sequence(ushort readFlag, byte[] identifer, byte[] read, byte[] blank, byte[] quality)
{
ReadFlag = (ReadFlag)readFlag;
Identifier = new ReadOnlyMemory<byte>(identifer).ToArray();
Read = new ReadOnlyMemory<byte>(read).ToArray();
Blank = new ReadOnlyMemory<byte>(blank).ToArray();
Expand All @@ -31,6 +34,7 @@ public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality)

public Sequence(BamAlignment bamAlignment)
{
ReadFlag = (ReadFlag)bamAlignment.flag;
Identifier = new ReadOnlyMemory<byte>(bamAlignment.read_name).ToArray();
Read = new ReadOnlyMemory<byte>(bamAlignment.seq).ToArray();
Quality = new ReadOnlyMemory<byte>(bamAlignment.qual).ToArray();
Expand All @@ -41,6 +45,7 @@ public override string ToString()
{
var sb = new StringBuilder("sequence: \n");

sb.AppendLine(ReadFlag.ToString());
sb.AppendLine(new string(Encoding.ASCII.GetChars(Identifier)));
sb.AppendLine(new string(Encoding.ASCII.GetChars(Read)));
sb.AppendLine(new string(Encoding.ASCII.GetChars(Blank)));
Expand Down
20 changes: 14 additions & 6 deletions Program.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.Json;
using CommandLine;
using Ovation.FasterQC.Net.Modules;
Expand All @@ -23,13 +24,20 @@ class Program

static void Main(string[] args)
{
Parser.Default.ParseArguments<CliOptions>(args)
.WithParsed(o =>
var parser = new Parser(config =>
{
o.Validate();
Settings = o;
new Program().Run();
});
config.AutoHelp = true;
config.AutoVersion = true;
config.CaseInsensitiveEnumValues = true;
}
);

parser.ParseArguments<CliOptions>(args)
.WithParsed(o =>
{
Settings = o;
new Program().Run();
});
}

private void Run()
Expand Down
2 changes: 1 addition & 1 deletion Readers/FastqLineReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public bool ReadSequence(out Sequence? sequence)
var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");

sequence = new Sequence(identifier, read, blank, quality);
sequence = new Sequence(0, identifier, read, blank, quality);
sequencesRead++;
return true;
}
Expand Down
15 changes: 12 additions & 3 deletions Readers/ReaderFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,19 @@ public static class ReaderFactory
{
public static ISequenceReader Create(CliOptions settings)
{
return settings switch
return settings.Format switch
{
{ Fastq: true } => new FastqLineReader(settings.InputFilename, settings.Zipped),
{ Bam: true } => new BamReader(settings.InputFilename),
ReaderType.Fastq => new FastqReader(settings.InputFilename, false),
ReaderType.FastqGz => new FastqReader(settings.InputFilename, true),

ReaderType.FastqLine => new FastqLineReader(settings.InputFilename, false),
ReaderType.FastqLineGz => new FastqLineReader(settings.InputFilename, true),

ReaderType.Sam => new SamReader(settings.InputFilename, false),
ReaderType.SamGz => new SamReader(settings.InputFilename, true),

ReaderType.Bam => new BamReader(settings.InputFilename),

_ => throw new InvalidOperationException($"could not determine file type of {settings.InputFilename}")
};
}
Expand Down
19 changes: 19 additions & 0 deletions Readers/ReaderType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace Ovation.FasterQC.Net
{
    /// <summary>
    /// Input formats that can be requested for a run. Each member names
    /// a file format and, for the <c>Gz</c> variants, that the input is
    /// gzip-compressed.
    /// </summary>
    public enum ReaderType
    {
        /// <summary>Uncompressed FASTQ input.</summary>
        Fastq = 0,

        /// <summary>Gzip-compressed FASTQ input.</summary>
        FastqGz = 1,

        /// <summary>Uncompressed FASTQ input, handled by the line-oriented reader.</summary>
        FastqLine = 2,

        /// <summary>Gzip-compressed FASTQ input, handled by the line-oriented reader.</summary>
        FastqLineGz = 3,

        /// <summary>Uncompressed SAM text input.</summary>
        Sam = 4,

        /// <summary>Gzip-compressed SAM text input.</summary>
        SamGz = 5,

        /// <summary>BAM input.</summary>
        Bam = 6
    }
}
136 changes: 136 additions & 0 deletions Readers/SamReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
using System;
using System.Globalization;
using System.IO;
using System.IO.Compression;
using System.Text;
using static Ovation.FasterQC.Net.Utils.CliOptions;

namespace Ovation.FasterQC.Net
{
    /// <summary>
    /// Reads sequences from a tab-delimited SAM text file, optionally
    /// gzip-compressed. Header lines (those starting with <c>@</c>) are
    /// consumed when the reader is constructed; each subsequent call to
    /// <see cref="ReadSequence"/> yields one alignment record.
    /// </summary>
    public class SamReader : ISequenceReader
    {
        /// <summary>
        /// Number of leading tab-separated columns a record must have for
        /// this reader to index the identifier (0), flag (1), read (9)
        /// and quality (10) columns.
        /// </summary>
        private const int MandatoryFieldCount = 11;

        /// <summary>
        /// Buffer size, in bytes, shared by the file, buffered and text layers.
        /// </summary>
        private const int BufferSize = 128 * 1024;

        private readonly FileStream inputStream;

        private readonly GZipStream? gzipStream;

        private readonly BufferedStream bufferedStream;

        private readonly StreamReader streamReader;

        private bool disposedValue;

        private int sequencesRead = 0;

        /// <summary>
        /// Number of records successfully returned by <see cref="ReadSequence"/>.
        /// </summary>
        public int SequencesRead => sequencesRead;

        /// <summary>
        /// Opens <paramref name="sam"/> and skips past its header section.
        /// </summary>
        /// <param name="sam">Path of the SAM file to read.</param>
        /// <param name="gzipped">
        /// <c>true</c> when the file is gzip-compressed and must be
        /// decompressed while reading.
        /// </param>
        public SamReader(string sam, bool gzipped = true)
        {
            var fileStreamOptions = new FileStreamOptions()
            {
                Mode = FileMode.Open,
                BufferSize = BufferSize,
            };

            inputStream = File.Open(sam, fileStreamOptions);

            try
            {
                Stream source = inputStream;

                if (gzipped)
                {
                    gzipStream = new GZipStream(inputStream, CompressionMode.Decompress);
                    source = gzipStream;
                }

                bufferedStream = new BufferedStream(source, BufferSize);
                streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, BufferSize);

                ConsumeHeader();
            }
            catch
            {
                // The caller never receives an instance when construction
                // fails, so nobody could Dispose it; release the file
                // handle here before propagating the failure.
                inputStream.Dispose();
                throw;
            }
        }

        /// <summary>
        /// Reads and discards the leading lines that start with <c>@</c>,
        /// echoing each one to standard error when debug output is enabled.
        /// </summary>
        private void ConsumeHeader()
        {
            try
            {
                while (streamReader.Peek() == '@')
                {
                    var header = streamReader.ReadLine();
                    On(Settings.Debug, () => Console.Error.WriteLine(header));
                }
            }
            catch (EndOfStreamException)
            {
                // swallow this, we've run out of file and a call
                // into ReadSequence will handle the EOF case
            }
        }

        /// <summary>
        /// Reads the next alignment record from the file.
        /// </summary>
        /// <param name="sequence">
        /// The parsed record, or <c>null</c> when the end of the file has
        /// been reached.
        /// </param>
        /// <returns>
        /// <c>true</c> when a record was read; <c>false</c> at end of file.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// A non-blank line has fewer than the 11 columns this reader needs.
        /// </exception>
        public bool ReadSequence(out Sequence? sequence)
        {
            try
            {
                string? entry;
                while ((entry = streamReader.ReadLine()) != null)
                {
                    // Blank lines carry no record; skip them rather than
                    // failing on the column lookups below.
                    if (string.IsNullOrWhiteSpace(entry))
                    {
                        continue;
                    }

                    // this is clearly a bad approach, we're going to be allocating a
                    // ton of small strings here, probably better to read the line,
                    // find the tabs ourselves, then pull the bytes out of the components
                    var parts = entry.Split('\t', StringSplitOptions.TrimEntries);

                    if (parts.Length < MandatoryFieldCount)
                    {
                        throw new InvalidDataException(
                            $"SAM record {sequencesRead + 1} has {parts.Length} tab-separated fields; expected at least {MandatoryFieldCount}");
                    }

                    var identifier = Encoding.ASCII.GetBytes(parts[0]);
                    var flag = ushort.Parse(parts[1], CultureInfo.InvariantCulture);
                    var read = Encoding.ASCII.GetBytes(parts[9]);
                    var blank = Array.Empty<byte>();
                    var quality = Encoding.ASCII.GetBytes(parts[10]);

                    sequence = new Sequence(flag, identifier, read, blank, quality);
                    sequencesRead++;
                    return true;
                }
            }
            catch (EndOfStreamException)
            {
                // treated the same as running out of lines; fall through
            }

            On(Settings.Verbose, () => Console.Error.WriteLine("End of stream"));
            sequence = null;
            return false;
        }

        /// <summary>
        /// Percentage (0-100) of the underlying file consumed so far, based
        /// on the file stream position. A zero-length file reports 100
        /// rather than the NaN that 0/0 would produce.
        /// </summary>
        public double ApproximateCompletion =>
            inputStream.Length == 0
                ? 100.0
                : 100.0 * inputStream.Position / inputStream.Length;

        /// <summary>
        /// Releases the stream chain. Safe to call more than once.
        /// </summary>
        /// <param name="disposing">
        /// <c>true</c> when called from <see cref="Dispose()"/>.
        /// </param>
        protected virtual void Dispose(bool disposing)
        {
            if (!disposedValue)
            {
                if (disposing)
                {
                    streamReader?.Dispose();
                    bufferedStream?.Dispose();
                    gzipStream?.Dispose();
                    inputStream?.Dispose();
                }

                disposedValue = true;
            }
        }

        /// <summary>
        /// Closes the input file and all wrapping streams.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }
    }
}
Loading

0 comments on commit 19bbc09

Please sign in to comment.