diff --git a/.vscode/launch.json b/.vscode/launch.json
index d62fcf7..5315e07 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -12,12 +12,23 @@
// If you have changed target frameworks, make sure to update the program path.
"program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll",
"args": [
- "-p", "-i", "/tmp/zr6254_1/zr6254_1.sorted.bam", "-o", "/tmp/bob.json", "-m", "BasicStatistics", "NCountsAtPosition"
+ "-v",
+ "-d",
+ "-f",
+ "sam",
+ "-i",
+ "./tmp/in3257_2_S1.sorted.sam",
+ "-o",
+ "./tmp/bob.json",
+ "-m",
+ "BasicStatistics",
+ "NCountsAtPosition"
],
"cwd": "${workspaceFolder}",
// For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console
"console": "internalConsole",
- "stopAtEntry": false
+ "stopAtEntry": false,
+ "requireExactSource": false
},
{
"name": ".NET Core Attach",
diff --git a/Interfaces/ISequenceReader.cs b/Interfaces/ISequenceReader.cs
index fe37083..954aa5a 100644
--- a/Interfaces/ISequenceReader.cs
+++ b/Interfaces/ISequenceReader.cs
@@ -6,7 +6,7 @@ public interface ISequenceReader : IDisposable
{
int SequencesRead { get; }
- bool ReadSequence(out Sequence sequence);
+ bool ReadSequence(out Sequence? sequence);
double ApproximateCompletion { get; }
}
diff --git a/Models/BamAlignment.cs b/Models/BamAlignment.cs
index 23dda7e..28cdc0f 100644
--- a/Models/BamAlignment.cs
+++ b/Models/BamAlignment.cs
@@ -1,6 +1,8 @@
+using System.Diagnostics.CodeAnalysis;
+
namespace Ovation.FasterQC.Net
{
-#pragma warning disable IDE1006
+ [SuppressMessage("Code style", "IDE1006", Justification = "Names correspond to BAM structure field names")]
public class BamAlignment
{
public uint block_size { get; set; }
@@ -27,13 +29,12 @@ public class BamAlignment
public int tlen { get; set; }
- public byte[] read_name { get; set; }
+ public byte[] read_name { get; set; } = null!;
- public uint[] cigar { get; set; }
+ public uint[] cigar { get; set; } = null!;
- public byte[] seq { get; set; }
+ public byte[] seq { get; set; } = null!;
- public byte[] qual { get; set; }
+ public byte[] qual { get; set; } = null!;
}
-#pragma warning restore IDE1006
-}
\ No newline at end of file
+}
diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs
new file mode 100644
index 0000000..5af2fae
--- /dev/null
+++ b/Models/ReadFlag.cs
@@ -0,0 +1,68 @@
+using System;
+
+namespace Ovation.FasterQC.Net
+{
+ [Flags]
+ public enum ReadFlag : ushort
+ {
+ /// <summary>
+ /// template having multiple segments in sequencing (read is paired)
+ /// </summary>
+ Paired = 1,
+
+ /// <summary>
+ /// each segment properly aligned according to the aligner (read mapped in proper pair)
+ /// </summary>
+ Aligned = 2,
+
+ /// <summary>
+ /// segment unmapped (read1 unmapped)
+ /// </summary>
+ SegmentUnmapped = 4,
+
+ /// <summary>
+ /// next segment in the template unmapped (read2 unmapped)
+ /// </summary>
+ NextSegmentUnmapped = 8,
+
+ /// <summary>
+ /// SEQ being reverse complemented (read1 reverse complemented)
+ /// </summary>
+ ReverseComplemented = 16,
+
+ /// <summary>
+ /// SEQ of the next segment in the template being reverse complemented (read2 reverse complemented)
+ /// </summary>
+ NextSegmentReverseComplemented = 32,
+
+ /// <summary>
+ /// the first segment in the template (is read1)
+ /// </summary>
+ FirstSegment = 64,
+
+ /// <summary>
+ /// the last segment in the template (is read2)
+ /// </summary>
+ LastSegment = 128,
+
+ /// <summary>
+ /// not primary alignment
+ /// </summary>
+ NotPrimaryAlignment = 256,
+
+ /// <summary>
+ /// alignment fails quality checks
+ /// </summary>
+ FailedQualityChecks = 512,
+
+ /// <summary>
+ /// PCR or optical duplicate
+ /// </summary>
+ OpticalDuplicate = 1024,
+
+ /// <summary>
+ /// supplementary alignment (e.g. aligner specific, could be a portion of a split read or a tied region)
+ /// </summary>
+ SupplementaryAlignment = 2048
+ }
+}
\ No newline at end of file
diff --git a/Models/Sequence.cs b/Models/Sequence.cs
index e9eff94..b4bd196 100644
--- a/Models/Sequence.cs
+++ b/Models/Sequence.cs
@@ -5,6 +5,8 @@ namespace Ovation.FasterQC.Net
{
public class Sequence
{
+ public ReadFlag ReadFlag { get; }
+
public byte[] Identifier { get; }
public byte[] Read { get; }
@@ -21,8 +23,9 @@ public Sequence(byte[] lines, int[] endOfLines)
Quality = new ReadOnlyMemory(lines, endOfLines[2], endOfLines[3] - endOfLines[2]).ToArray();
}
- public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality)
+ public Sequence(ushort readFlag, byte[] identifer, byte[] read, byte[] blank, byte[] quality)
{
+ ReadFlag = (ReadFlag)readFlag;
Identifier = new ReadOnlyMemory(identifer).ToArray();
Read = new ReadOnlyMemory(read).ToArray();
Blank = new ReadOnlyMemory(blank).ToArray();
@@ -31,15 +34,18 @@ public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality)
public Sequence(BamAlignment bamAlignment)
{
+ ReadFlag = (ReadFlag)bamAlignment.flag;
Identifier = new ReadOnlyMemory(bamAlignment.read_name).ToArray();
Read = new ReadOnlyMemory(bamAlignment.seq).ToArray();
Quality = new ReadOnlyMemory(bamAlignment.qual).ToArray();
+ Blank = Array.Empty();
}
public override string ToString()
{
var sb = new StringBuilder("sequence: \n");
+ sb.AppendLine(ReadFlag.ToString());
sb.AppendLine(new string(Encoding.ASCII.GetChars(Identifier)));
sb.AppendLine(new string(Encoding.ASCII.GetChars(Read)));
sb.AppendLine(new string(Encoding.ASCII.GetChars(Blank)));
@@ -48,4 +54,4 @@ public override string ToString()
return sb.ToString();
}
}
-}
\ No newline at end of file
+}
diff --git a/Ovation.FasterQC.Net.csproj b/Ovation.FasterQC.Net.csproj
index 9eb1d86..77ff3f2 100644
--- a/Ovation.FasterQC.Net.csproj
+++ b/Ovation.FasterQC.Net.csproj
@@ -3,6 +3,7 @@
Exe
net6.0
+ enable
diff --git a/Program.cs b/Program.cs
index 527619b..a502ffe 100644
--- a/Program.cs
+++ b/Program.cs
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
+using System.Linq;
using System.Text.Json;
using CommandLine;
using Ovation.FasterQC.Net.Modules;
@@ -19,17 +20,24 @@ class Program
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
- private TimedSequenceProgressBar progressBar;
+ private TimedSequenceProgressBar? progressBar;
static void Main(string[] args)
{
- Parser.Default.ParseArguments(args)
- .WithParsed(o =>
+ var parser = new Parser(config =>
{
- o.Validate();
- Settings = o;
- new Program().Run();
- });
+ config.AutoHelp = true;
+ config.AutoVersion = true;
+ config.CaseInsensitiveEnumValues = true;
+
+ // Parser.Default writes to Console.Error, but a hand-built Parser has
+ // no HelpWriter until one is assigned; without this line --help,
+ // --version and usage errors would produce no output at all.
+ config.HelpWriter = Console.Error;
+ }
+ );
+
+ parser.ParseArguments(args)
+ .WithParsed(o =>
+ {
+ Settings = o;
+ new Program().Run();
+ });
}
private void Run()
@@ -42,14 +50,16 @@ private void Run()
On(Settings.ShowProgress, () => progressBar = new TimedSequenceProgressBar(sequenceReader));
On(Settings.Verbose, () => Console.Error.WriteLine($"Processing {Settings.InputFilename}..."));
- while (sequenceReader.ReadSequence(out Sequence sequence))
+ while (sequenceReader.ReadSequence(out Sequence? sequence))
{
+ ArgumentNullException.ThrowIfNull(sequence);
+
foreach (var module in modules)
{
module.ProcessSequence(sequence);
}
- On(Settings.ShowProgress, () => progressBar.Update());
+ On(Settings.ShowProgress, () => progressBar?.Update());
On(Settings.Verbose, () =>
{
if (sequenceReader.SequencesRead % UpdatePeriod == 0)
@@ -71,7 +81,7 @@ private void Run()
results[module.Name] = module.Data;
}
- On(Settings.ShowProgress, () => progressBar.Update(force: true));
+ On(Settings.ShowProgress, () => progressBar?.Update(force: true));
On(Settings.Verbose, () => Console.Error.WriteLine($"{sequenceReader.SequencesRead.WithSsiUnits()} sequences completed ({sequenceReader.ApproximateCompletion:0.0}%)"));
if (string.IsNullOrWhiteSpace(Settings.OutputFilename))
diff --git a/Readers/BamReader.cs b/Readers/BamReader.cs
index a2d9d4c..5c6f5a0 100644
--- a/Readers/BamReader.cs
+++ b/Readers/BamReader.cs
@@ -40,7 +40,7 @@ public BamReader(string bam)
ConsumeHeader();
}
- public bool ReadSequence(out Sequence sequence)
+ public bool ReadSequence(out Sequence? sequence)
{
try
{
@@ -106,9 +106,10 @@ private BamAlignment ReadSequence()
var bamAlignment = new BamAlignment
{
block_size = block_size,
+ refID = BitConverter.ToInt32(block, offset)
};
- bamAlignment.refID = BitConverter.ToInt32(block, offset); offset += 4;
+ offset += 4;
bamAlignment.pos = BitConverter.ToInt32(block, offset) + 1; offset += 4;
bamAlignment.l_read_name = block[offset]; offset += 1;
bamAlignment.mapq = block[offset]; offset += 1;
diff --git a/Readers/FastqLineReader.cs b/Readers/FastqLineReader.cs
index de7611b..07efeac 100644
--- a/Readers/FastqLineReader.cs
+++ b/Readers/FastqLineReader.cs
@@ -10,7 +10,7 @@ public class FastqLineReader : ISequenceReader
{
private readonly FileStream inputStream;
- private readonly GZipStream gzipStream;
+ private readonly GZipStream? gzipStream;
private readonly BufferedStream bufferedStream;
@@ -50,7 +50,7 @@ public FastqLineReader(string fastq, bool gzipped = true)
}
}
- public bool ReadSequence(out Sequence sequence)
+ public bool ReadSequence(out Sequence? sequence)
{
try
{
@@ -61,12 +61,12 @@ public bool ReadSequence(out Sequence sequence)
return false;
}
- var identifier = Encoding.ASCII.GetBytes(streamReader.ReadLine());
- var read = Encoding.ASCII.GetBytes(streamReader.ReadLine());
- var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine());
- var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine());
+ var identifier = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
+ var read = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
+ var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
+ var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
- sequence = new Sequence(identifier, read, blank, quality);
+ sequence = new Sequence(0, identifier, read, blank, quality);
sequencesRead++;
return true;
}
diff --git a/Readers/FastqReader.cs b/Readers/FastqReader.cs
index 946ee2b..bbdca57 100644
--- a/Readers/FastqReader.cs
+++ b/Readers/FastqReader.cs
@@ -9,7 +9,7 @@ public class FastqReader : ISequenceReader
{
private readonly FileStream inputStream;
- private readonly GZipStream gzipStream;
+ private readonly GZipStream? gzipStream;
private readonly BufferedStream bufferedStream;
@@ -49,7 +49,7 @@ public FastqReader(string fastq, bool gzipped = true)
}
}
- public bool ReadSequence(out Sequence sequence)
+ public bool ReadSequence(out Sequence? sequence)
{
// this is clearly dangerous, instead read a large chunk of the file
// and then walk through it returning only the consumed portion while
diff --git a/Readers/ReaderFactory.cs b/Readers/ReaderFactory.cs
index c8c34bf..2b8a204 100644
--- a/Readers/ReaderFactory.cs
+++ b/Readers/ReaderFactory.cs
@@ -7,10 +7,19 @@ public static class ReaderFactory
{
public static ISequenceReader Create(CliOptions settings)
{
- return settings switch
+ return settings.Format switch
{
- { Fastq: true } => new FastqLineReader(settings.InputFilename, true),
- { Bam: true } => new BamReader(settings.InputFilename),
+ ReaderType.Fastq => new FastqReader(settings.InputFilename, false),
+ ReaderType.FastqGz => new FastqReader(settings.InputFilename, true),
+
+ ReaderType.FastqLine => new FastqLineReader(settings.InputFilename, false),
+ ReaderType.FastqLineGz => new FastqLineReader(settings.InputFilename, true),
+
+ ReaderType.Sam => new SamReader(settings.InputFilename, false),
+ ReaderType.SamGz => new SamReader(settings.InputFilename, true),
+
+ ReaderType.Bam => new BamReader(settings.InputFilename),
+
_ => throw new InvalidOperationException($"could not determine file type of {settings.InputFilename}")
};
}
diff --git a/Readers/ReaderType.cs b/Readers/ReaderType.cs
new file mode 100644
index 0000000..19b746a
--- /dev/null
+++ b/Readers/ReaderType.cs
@@ -0,0 +1,19 @@
+namespace Ovation.FasterQC.Net
+{
+ public enum ReaderType
+ {
+ Fastq, // ReaderFactory creates a FastqReader with gzipped = false
+
+ FastqGz, // ReaderFactory creates a FastqReader with gzipped = true
+
+ FastqLine, // ReaderFactory creates a FastqLineReader with gzipped = false
+
+ FastqLineGz, // ReaderFactory creates a FastqLineReader with gzipped = true
+
+ Sam, // ReaderFactory creates a SamReader with gzipped = false
+
+ SamGz, // ReaderFactory creates a SamReader with gzipped = true
+
+ Bam // ReaderFactory creates a BamReader
+ }
+}
\ No newline at end of file
diff --git a/Readers/SamReader.cs b/Readers/SamReader.cs
new file mode 100644
index 0000000..ec20c03
--- /dev/null
+++ b/Readers/SamReader.cs
@@ -0,0 +1,193 @@
+using System;
+using System.Globalization;
+using System.IO;
+using System.IO.Compression;
+using System.Text;
+using static Ovation.FasterQC.Net.Utils.CliOptions;
+
+namespace Ovation.FasterQC.Net
+{
+    /// <summary>
+    /// Reads alignment lines from a SAM text file, optionally gzip-compressed,
+    /// and returns each one as a <see cref="Sequence"/>. Leading header lines
+    /// (those starting with '@') are consumed by the constructor.
+    /// </summary>
+    public class SamReader : ISequenceReader
+    {
+        // ReadSequence uses fields 0 (identifier), 1 (flag), 9 (read) and
+        // 10 (quality), so a usable line needs at least 11 tab-separated fields.
+        private const int MinimumFieldCount = 11;
+
+        private readonly FileStream inputStream;
+
+        private readonly GZipStream? gzipStream;
+
+        private readonly BufferedStream bufferedStream;
+
+        private readonly StreamReader streamReader;
+
+        private bool disposedValue;
+
+        private int sequencesRead = 0;
+
+        // Number of lines pulled from the file so far, header lines included;
+        // only used to make parse errors locatable.
+        private int linesRead = 0;
+
+        public int SequencesRead => sequencesRead;
+
+        /// <summary>
+        /// Opens <paramref name="sam"/> and skips its header lines.
+        /// </summary>
+        /// <param name="sam">Path of the SAM file to read.</param>
+        /// <param name="gzipped">True to decompress the file with gzip while reading.</param>
+        public SamReader(string sam, bool gzipped = true)
+        {
+            var bufferSize = 128 * 1024;
+
+            var fileStreamOptions = new FileStreamOptions()
+            {
+                Mode = FileMode.Open,
+                BufferSize = bufferSize,
+            };
+
+            inputStream = File.Open(sam, fileStreamOptions);
+
+            try
+            {
+                Stream source = inputStream;
+                if (gzipped)
+                {
+                    gzipStream = new GZipStream(inputStream, CompressionMode.Decompress);
+                    source = gzipStream;
+                }
+
+                bufferedStream = new BufferedStream(source, bufferSize);
+                streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize);
+
+                ConsumeHeader();
+            }
+            catch
+            {
+                // When the constructor throws the caller never gets an instance
+                // to dispose, so release the file handle here.
+                gzipStream?.Dispose();
+                inputStream.Dispose();
+                throw;
+            }
+        }
+
+        // Skips the leading header lines, echoing each one when Debug is set.
+        private void ConsumeHeader()
+        {
+            try
+            {
+                while (streamReader.Peek() == '@')
+                {
+                    var header = streamReader.ReadLine();
+                    linesRead++;
+                    On(Settings.Debug, () => Console.Error.WriteLine(header));
+                }
+            }
+            catch (EndOfStreamException)
+            {
+                // swallow this, we've run out of file and a call
+                // into ReadSequence will handle the EOF case
+            }
+        }
+
+        /// <summary>
+        /// Reads the next alignment line and converts it to a <see cref="Sequence"/>.
+        /// </summary>
+        /// <param name="sequence">The parsed record, or null once the input is exhausted.</param>
+        /// <returns>True when a record was read; false at end of input.</returns>
+        /// <exception cref="InvalidDataException">A line has fewer than 11 tab-separated fields.</exception>
+        /// <exception cref="FormatException">The flag field is not an integer.</exception>
+        public bool ReadSequence(out Sequence? sequence)
+        {
+            try
+            {
+                string? entry;
+                while ((entry = streamReader.ReadLine()) is not null)
+                {
+                    linesRead++;
+
+                    // A blank line (typically a stray trailing newline) carries no
+                    // record; skip it instead of failing on its missing fields.
+                    if (string.IsNullOrWhiteSpace(entry))
+                    {
+                        continue;
+                    }
+
+                    sequence = ParseAlignment(entry);
+                    sequencesRead++;
+                    return true;
+                }
+            }
+            catch (EndOfStreamException)
+            {
+                // treated the same as running out of lines
+            }
+
+            On(Settings.Verbose, () => Console.Error.WriteLine("End of stream"));
+            sequence = null;
+            return false;
+        }
+
+        // Builds a Sequence from one non-blank alignment line.
+        private Sequence ParseAlignment(string entry)
+        {
+            // this is clearly a bad approach, we're going to be allocating a
+            // ton of small strings here, probably better to read the line,
+            // find the tabs ourselves, then pull the bytes out of the components
+            var parts = entry.Split('\t', StringSplitOptions.TrimEntries);
+
+            if (parts.Length < MinimumFieldCount)
+            {
+                throw new InvalidDataException(
+                    $"SAM line {linesRead} has {parts.Length} tab-separated fields; at least {MinimumFieldCount} are required");
+            }
+
+            var identifier = Encoding.ASCII.GetBytes(parts[0]);
+            var flag = ushort.Parse(parts[1], CultureInfo.InvariantCulture);
+            var read = Encoding.ASCII.GetBytes(parts[9]);
+            var blank = Array.Empty<byte>();
+            var quality = Encoding.ASCII.GetBytes(parts[10]);
+
+            return new Sequence(flag, identifier, read, blank, quality);
+        }
+
+        /// <summary>
+        /// Percentage of the underlying file consumed so far. It is measured on the
+        /// file stream itself, so with the read buffers in between it can run ahead
+        /// of the records actually returned.
+        /// </summary>
+        public double ApproximateCompletion =>
+            // An empty file would otherwise evaluate 0 / 0 and report NaN.
+            inputStream.Length == 0
+                ? 100.0
+                : 100.0 * inputStream.Position / inputStream.Length;
+
+        protected virtual void Dispose(bool disposing)
+        {
+            if (!disposedValue)
+            {
+                if (disposing)
+                {
+                    streamReader?.Dispose();
+                    bufferedStream?.Dispose();
+                    gzipStream?.Dispose();
+                    inputStream?.Dispose();
+                }
+
+                disposedValue = true;
+            }
+        }
+
+        public void Dispose()
+        {
+            Dispose(true);
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/Utils/CliOptions.cs b/Utils/CliOptions.cs
index 3c6422b..a5639c2 100644
--- a/Utils/CliOptions.cs
+++ b/Utils/CliOptions.cs
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
-using System.Globalization;
using CommandLine;
namespace Ovation.FasterQC.Net.Utils
@@ -10,32 +9,25 @@ public class CliOptions
[Option('v', "verbose", Required = false, SetName = "Verbose", HelpText = "Set output to verbose messages.")]
public bool Verbose { get; set; }
- [Option("debug", Required = false, SetName = "Verbose", HelpText = "Show diagnostic output. Can only use with --verbose.")]
+ [Option('d', "debug", Required = false, SetName = "Verbose", HelpText = "Show diagnostic output. Can only use with --verbose.")]
public bool Debug { get; set; }
[Option('p', "progress", Required = false, SetName = "Progress", HelpText = "Show progress bar. Cannnot use with --verbose.")]
public bool ShowProgress { get; set; }
[Option('i', "input", Required = true, HelpText = "Input filename.")]
- public string InputFilename { get; set; }
+ public string InputFilename { get; set; } = null!;
[Option('o', "output", Required = false, HelpText = "Output filename. Defaults to STDOUT.")]
- public string OutputFilename { get; set; }
+ public string OutputFilename { get; set; } = null!;
- [Option('b', "bam", Required = false, HelpText = "Assume BAM format.")]
- public bool Bam { get; set; }
-
- [Option('f', "fastq", Required = false, HelpText = "Assume FASTQ format.")]
- public bool Fastq { get; set; }
-
- [Option('z', "zipped", Required = false, HelpText = "Assume input file is gzipped.")]
- public bool Zipped { get; set; }
+ [Option('f', "format", Required = true, HelpText = "Type of input file.")]
+ public ReaderType Format { get; set; }
[Option('m', "modules", Required = false, HelpText = "Space-separated list of modules to run, or 'all'.")]
public IEnumerable ModuleNames { get; set; }
- public static CliOptions Settings
- { get; set; }
+ public static CliOptions Settings { get; set; } = null!;
public const int UpdatePeriod = 100_000;
@@ -46,27 +38,5 @@ public static void On(bool condition, Action action)
action();
}
}
-
- public bool Validate()
- {
- if (!(Bam || Fastq))
- {
- Fastq = InputFilename.EndsWith(".fastq", ignoreCase: true, culture: CultureInfo.InvariantCulture)
- || InputFilename.EndsWith(".fastq.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture);
-
- Bam = InputFilename.EndsWith(".bam", ignoreCase: true, culture: CultureInfo.InvariantCulture)
- || InputFilename.EndsWith(".bam.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture);
- }
-
- if (Zipped == false && !string.IsNullOrWhiteSpace(InputFilename))
- {
- if (InputFilename.EndsWith(".gz", ignoreCase: true, culture: CultureInfo.InvariantCulture))
- {
- Zipped = true;
- }
- }
-
- return Fastq || Bam;
- }
}
}