diff --git a/.vscode/launch.json b/.vscode/launch.json
index d62fcf7..5315e07 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -12,12 +12,23 @@
// If you have changed target frameworks, make sure to update the program path.
"program": "${workspaceFolder}/bin/Debug/net6.0/Ovation.FasterQC.Net.dll",
"args": [
- "-p", "-i", "/tmp/zr6254_1/zr6254_1.sorted.bam", "-o", "/tmp/bob.json", "-m", "BasicStatistics", "NCountsAtPosition"
+ "-v",
+ "-d",
+ "-f",
+ "sam",
+ "-i",
+ "./tmp/in3257_2_S1.sorted.sam",
+ "-o",
+ "./tmp/bob.json",
+ "-m",
+ "BasicStatistics",
+ "NCountsAtPosition"
],
"cwd": "${workspaceFolder}",
// For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console
"console": "internalConsole",
- "stopAtEntry": false
+ "stopAtEntry": false,
+ "requireExactSource": false
},
{
"name": ".NET Core Attach",
diff --git a/Models/ReadFlag.cs b/Models/ReadFlag.cs
new file mode 100644
index 0000000..5af2fae
--- /dev/null
+++ b/Models/ReadFlag.cs
@@ -0,0 +1,68 @@
+using System;
+
+namespace Ovation.FasterQC.Net
+{
+    [Flags] // bit values of the FLAG field carried by SAM/BAM alignment records
+    public enum ReadFlag : ushort
+    {
+        /// <summary>
+        /// The template has multiple segments in sequencing (the read is paired).
+        /// </summary>
+        Paired = 0x1,
+
+        /// <summary>
+        /// Every segment is properly aligned according to the aligner (read mapped in proper pair).
+        /// </summary>
+        Aligned = 0x2,
+
+        /// <summary>
+        /// This segment is unmapped (read1 unmapped).
+        /// </summary>
+        SegmentUnmapped = 0x4,
+
+        /// <summary>
+        /// The next segment in the template is unmapped (read2 unmapped).
+        /// </summary>
+        NextSegmentUnmapped = 0x8,
+
+        /// <summary>
+        /// SEQ is reverse complemented (read1 reverse complemented).
+        /// </summary>
+        ReverseComplemented = 0x10,
+
+        /// <summary>
+        /// SEQ of the next segment in the template is reverse complemented (read2 reverse complemented).
+        /// </summary>
+        NextSegmentReverseComplemented = 0x20,
+
+        /// <summary>
+        /// This is the first segment in the template (is read1).
+        /// </summary>
+        FirstSegment = 0x40,
+
+        /// <summary>
+        /// This is the last segment in the template (is read2).
+        /// </summary>
+        LastSegment = 0x80,
+
+        /// <summary>
+        /// Not the primary alignment.
+        /// </summary>
+        NotPrimaryAlignment = 0x100,
+
+        /// <summary>
+        /// The alignment fails quality checks.
+        /// </summary>
+        FailedQualityChecks = 0x200,
+
+        /// <summary>
+        /// PCR or optical duplicate.
+        /// </summary>
+        OpticalDuplicate = 0x400,
+
+        /// <summary>
+        /// Supplementary alignment (aligner specific; may be a portion of a split read or a tied region).
+        /// </summary>
+        SupplementaryAlignment = 0x800
+    }
+}
\ No newline at end of file
diff --git a/Models/Sequence.cs b/Models/Sequence.cs
index 3c63b88..b4bd196 100644
--- a/Models/Sequence.cs
+++ b/Models/Sequence.cs
@@ -5,6 +5,8 @@ namespace Ovation.FasterQC.Net
{
public class Sequence
{
+ public ReadFlag ReadFlag { get; }
+
public byte[] Identifier { get; }
public byte[] Read { get; }
@@ -21,8 +23,9 @@ public Sequence(byte[] lines, int[] endOfLines)
Quality = new ReadOnlyMemory(lines, endOfLines[2], endOfLines[3] - endOfLines[2]).ToArray();
}
- public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality)
+ public Sequence(ushort readFlag, byte[] identifer, byte[] read, byte[] blank, byte[] quality)
{
+ ReadFlag = (ReadFlag)readFlag;
Identifier = new ReadOnlyMemory(identifer).ToArray();
Read = new ReadOnlyMemory(read).ToArray();
Blank = new ReadOnlyMemory(blank).ToArray();
@@ -31,6 +34,7 @@ public Sequence(byte[] identifer, byte[] read, byte[] blank, byte[] quality)
public Sequence(BamAlignment bamAlignment)
{
+ ReadFlag = (ReadFlag)bamAlignment.flag;
Identifier = new ReadOnlyMemory(bamAlignment.read_name).ToArray();
Read = new ReadOnlyMemory(bamAlignment.seq).ToArray();
Quality = new ReadOnlyMemory(bamAlignment.qual).ToArray();
@@ -41,6 +45,7 @@ public override string ToString()
{
var sb = new StringBuilder("sequence: \n");
+ sb.AppendLine(ReadFlag.ToString());
sb.AppendLine(new string(Encoding.ASCII.GetChars(Identifier)));
sb.AppendLine(new string(Encoding.ASCII.GetChars(Read)));
sb.AppendLine(new string(Encoding.ASCII.GetChars(Blank)));
diff --git a/Program.cs b/Program.cs
index fb15b01..a502ffe 100644
--- a/Program.cs
+++ b/Program.cs
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
+using System.Linq;
using System.Text.Json;
using CommandLine;
using Ovation.FasterQC.Net.Modules;
@@ -23,13 +24,20 @@ class Program
static void Main(string[] args)
{
- Parser.Default.ParseArguments(args)
- .WithParsed(o =>
+ var parser = new Parser(config =>
{
- o.Validate();
- Settings = o;
- new Program().Run();
- });
+ config.AutoHelp = true;
+ config.AutoVersion = true;
+ config.CaseInsensitiveEnumValues = true;
+ config.HelpWriter = Console.Error; // unlike Parser.Default, a hand-built Parser prints no help or errors without a writer
+ });
+
+ parser.ParseArguments<CliOptions>(args)
+ .WithParsed(o =>
+ {
+ Settings = o;
+ new Program().Run();
+ });
}
private void Run()
diff --git a/Readers/FastqLineReader.cs b/Readers/FastqLineReader.cs
index 9c2c264..07efeac 100644
--- a/Readers/FastqLineReader.cs
+++ b/Readers/FastqLineReader.cs
@@ -66,7 +66,7 @@ public bool ReadSequence(out Sequence? sequence)
var blank = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
var quality = Encoding.ASCII.GetBytes(streamReader.ReadLine() ?? "");
- sequence = new Sequence(identifier, read, blank, quality);
+ sequence = new Sequence(0, identifier, read, blank, quality);
sequencesRead++;
return true;
}
diff --git a/Readers/ReaderFactory.cs b/Readers/ReaderFactory.cs
index 8b63848..2b8a204 100644
--- a/Readers/ReaderFactory.cs
+++ b/Readers/ReaderFactory.cs
@@ -7,10 +7,19 @@ public static class ReaderFactory
{
public static ISequenceReader Create(CliOptions settings)
{
- return settings switch
+ return settings.Format switch
{
- { Fastq: true } => new FastqLineReader(settings.InputFilename, settings.Zipped),
- { Bam: true } => new BamReader(settings.InputFilename),
+ ReaderType.Fastq => new FastqReader(settings.InputFilename, false),
+ ReaderType.FastqGz => new FastqReader(settings.InputFilename, true),
+
+ ReaderType.FastqLine => new FastqLineReader(settings.InputFilename, false),
+ ReaderType.FastqLineGz => new FastqLineReader(settings.InputFilename, true),
+
+ ReaderType.Sam => new SamReader(settings.InputFilename, false),
+ ReaderType.SamGz => new SamReader(settings.InputFilename, true),
+
+ ReaderType.Bam => new BamReader(settings.InputFilename),
+
_ => throw new InvalidOperationException($"could not determine file type of {settings.InputFilename}")
};
}
diff --git a/Readers/ReaderType.cs b/Readers/ReaderType.cs
new file mode 100644
index 0000000..19b746a
--- /dev/null
+++ b/Readers/ReaderType.cs
@@ -0,0 +1,19 @@
+namespace Ovation.FasterQC.Net
+{
+    public enum ReaderType // input formats; ReaderFactory.Create picks the reader for each
+    {
+        Fastq = 0, // FASTQ input, served by FastqReader
+
+        FastqGz = 1, // as Fastq, with the reader's second argument set to true (presumably gzip, per the name; confirm)
+
+        FastqLine = 2, // FASTQ input, served by FastqLineReader
+
+        FastqLineGz = 3, // as FastqLine, with the reader's zipped flag set
+
+        Sam = 4, // SAM text input, served by SamReader
+
+        SamGz = 5, // as Sam, with SamReader's gzipped flag set
+
+        Bam = 6 // BAM input, served by BamReader
+    }
+}
\ No newline at end of file
diff --git a/Readers/SamReader.cs b/Readers/SamReader.cs
new file mode 100644
index 0000000..ec20c03
--- /dev/null
+++ b/Readers/SamReader.cs
@@ -0,0 +1,160 @@
+using System;
+using System.Globalization;
+using System.IO;
+using System.IO.Compression;
+using System.Text;
+using static Ovation.FasterQC.Net.Utils.CliOptions;
+
+namespace Ovation.FasterQC.Net
+{
+    /// <summary>
+    /// Streams alignment records out of a SAM text file, optionally gunzipping it
+    /// on the fly, and hands each record back as a <see cref="Sequence"/>.
+    /// </summary>
+    public class SamReader : ISequenceReader
+    {
+        // the quality string is the 11th tab-separated field, so shorter lines cannot be parsed
+        private const int MinimumFields = 11;
+
+        private readonly FileStream inputStream;
+        private readonly GZipStream? gzipStream;
+        private readonly BufferedStream bufferedStream;
+        private readonly StreamReader streamReader;
+
+        private bool disposedValue;
+        private int sequencesRead = 0;
+
+        /// <summary>Number of records handed out by <see cref="ReadSequence"/> so far.</summary>
+        public int SequencesRead => sequencesRead;
+
+        /// <summary>Opens the file and consumes its leading '@' header lines.</summary>
+        /// <param name="sam">Path of the SAM file to open.</param>
+        /// <param name="gzipped">True to gunzip the file while reading it.</param>
+        public SamReader(string sam, bool gzipped = true)
+        {
+            var bufferSize = 128 * 1024;
+            var fileStreamOptions = new FileStreamOptions()
+            {
+                Mode = FileMode.Open,
+                BufferSize = bufferSize,
+            };
+
+            try
+            {
+                inputStream = File.Open(sam, fileStreamOptions);
+                gzipStream = gzipped ? new GZipStream(inputStream, CompressionMode.Decompress) : null;
+                bufferedStream = new BufferedStream((Stream?)gzipStream ?? inputStream, bufferSize);
+                streamReader = new StreamReader(bufferedStream, Encoding.ASCII, false, bufferSize);
+
+                ConsumeHeader();
+            }
+            catch
+            {
+                // the caller never gets the instance, so release whatever was opened
+                streamReader?.Dispose();
+                bufferedStream?.Dispose();
+                gzipStream?.Dispose();
+                inputStream?.Dispose();
+                throw;
+            }
+        }
+
+        /// <summary>Skips the '@'-prefixed header lines at the top of the file.</summary>
+        private void ConsumeHeader()
+        {
+            try
+            {
+                // NOTE: Peek() is documented to return -1 when nothing can be peeked,
+                // which may happen before the header ends; ReadSequence copes with that
+                while (streamReader.Peek() == '@')
+                {
+                    var header = streamReader.ReadLine();
+                    On(Settings.Debug, () => Console.Error.WriteLine(header));
+                }
+            }
+            catch (EndOfStreamException)
+            {
+                // swallow this, we've run out of file and a call
+                // into ReadSequence will handle the EOF case
+            }
+        }
+
+        /// <summary>Reads the next alignment record.</summary>
+        /// <param name="sequence">The parsed record, or null once the input is exhausted.</param>
+        /// <returns>True when a record was read, false at the end of the input.</returns>
+        /// <exception cref="InvalidDataException">A line has fewer than eleven fields.</exception>
+        public bool ReadSequence(out Sequence? sequence)
+        {
+            try
+            {
+                string? entry;
+                while ((entry = streamReader.ReadLine()) != null)
+                {
+                    // blank lines and header lines that ConsumeHeader did not reach
+                    // are not alignment records, so move on to the next line
+                    if (string.IsNullOrWhiteSpace(entry) || entry[0] == '@')
+                    {
+                        continue;
+                    }
+
+                    // this is clearly a bad approach, we're going to be allocating a
+                    // ton of small strings here, probably better to read the line,
+                    // find the tabs ourselves, then pull the bytes out of the components
+                    var parts = entry.Split('\t', StringSplitOptions.TrimEntries);
+                    if (parts.Length < MinimumFields)
+                    {
+                        throw new InvalidDataException(
+                            $"SAM record #{sequencesRead + 1} has {parts.Length} tab-separated fields, expected at least {MinimumFields}");
+                    }
+
+                    var identifier = Encoding.ASCII.GetBytes(parts[0]);
+                    var flag = ushort.Parse(parts[1], CultureInfo.InvariantCulture);
+                    var read = Encoding.ASCII.GetBytes(parts[9]);
+                    var blank = Array.Empty<byte>();
+                    var quality = Encoding.ASCII.GetBytes(parts[10]);
+
+                    sequence = new Sequence(flag, identifier, read, blank, quality);
+                    sequencesRead++;
+                    return true;
+                }
+            }
+            catch (EndOfStreamException)
+            {
+                // handled like a regular end of file below
+            }
+
+            On(Settings.Verbose, () => Console.Error.WriteLine("End of stream"));
+            sequence = null;
+            return false;
+        }
+
+        /// <summary>Percentage (0-100) of the underlying file consumed so far; 100 for an empty file.</summary>
+        public double ApproximateCompletion =>
+            inputStream.Length == 0 ? 100.0 : 100.0 * inputStream.Position / inputStream.Length;
+
+        /// <summary>Releases the reader and the streams beneath it.</summary>
+        /// <param name="disposing">True when called from <see cref="Dispose()"/>.</param>
+        protected virtual void Dispose(bool disposing)
+        {
+            if (!disposedValue)
+            {
+                if (disposing)
+                {
+                    streamReader?.Dispose();
+                    bufferedStream?.Dispose();
+                    gzipStream?.Dispose();
+                    inputStream?.Dispose();
+                }
+
+                disposedValue = true;
+            }
+        }
+
+        /// <summary>Disposes the reader and suppresses finalization.</summary>
+        public void Dispose()
+        {
+            Dispose(true);
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/Utils/CliOptions.cs b/Utils/CliOptions.cs
index a9e4b76..a4b4a29 100644
--- a/Utils/CliOptions.cs
+++ b/Utils/CliOptions.cs
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
-using System.Globalization;
using CommandLine;
namespace Ovation.FasterQC.Net.Utils
@@ -10,7 +9,7 @@ public class CliOptions
[Option('v', "verbose", Required = false, SetName = "Verbose", HelpText = "Set output to verbose messages.")]
public bool Verbose { get; set; }
- [Option("debug", Required = false, SetName = "Verbose", HelpText = "Show diagnostic output. Can only use with --verbose.")]
+ [Option('d', "debug", Required = false, SetName = "Verbose", HelpText = "Show diagnostic output. Can only use with --verbose.")]
public bool Debug { get; set; }
[Option('p', "progress", Required = false, SetName = "Progress", HelpText = "Show progress bar. Cannnot use with --verbose.")]
@@ -22,14 +21,8 @@ public class CliOptions
[Option('o', "output", Required = false, HelpText = "Output filename. Defaults to STDOUT.")]
public string OutputFilename { get; set; } = null!;
- [Option('b', "bam", Required = false, HelpText = "Assume BAM format.")]
- public bool Bam { get; set; }
-
- [Option('f', "fastq", Required = false, HelpText = "Assume FASTQ format.")]
- public bool Fastq { get; set; }
-
- [Option('z', "zipped", Required = false, HelpText = "Assume input file is gzipped.")]
- public bool Zipped { get; set; }
+ [Option('f', "format", Required = true, HelpText = "Type of input file.")]
+ public ReaderType Format { get; set; }
[Option('m', "modules", Required = true, Min = 1, HelpText = "Space-separated list of modules to run, or 'all'.")]
public IEnumerable ModuleNames { get; set; } = Array.Empty();
@@ -45,27 +38,5 @@ public static void On(bool condition, Action action)
action();
}
}
-
- public bool Validate()
- {
- if (!(Bam || Fastq))
- {
- Fastq = InputFilename.EndsWith(".fastq", ignoreCase: true, culture: CultureInfo.InvariantCulture)
- || InputFilename.EndsWith(".fastq.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture);
-
- Bam = InputFilename.EndsWith(".bam", ignoreCase: true, culture: CultureInfo.InvariantCulture)
- || InputFilename.EndsWith(".bam.gz", ignoreCase: true, culture: CultureInfo.InvariantCulture);
- }
-
- if (Zipped == false && !string.IsNullOrWhiteSpace(InputFilename))
- {
- if (InputFilename.EndsWith(".gz", ignoreCase: true, culture: CultureInfo.InvariantCulture))
- {
- Zipped = true;
- }
- }
-
- return Fastq || Bam;
- }
}
}