diff --git a/Cargo.toml b/Cargo.toml index 547cb94..6a56586 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libradicl" -version = "0.7.0" +version = "0.8.0" authors = [ "Avi Srivastava ", "Hirak Sarkar ", @@ -15,7 +15,7 @@ readme = "README.md" repository = "https://github.com/COMBINE-lab/alevin-fry" homepage = "https://github.com/COMBINE-lab/alevin-fry" documentation = "https://alevin-fry.readthedocs.io/en/latest/" -include = ["src/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"] +include = ["src/*.rs", "examples/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"] keywords = [ "single-cell", "preprocessing", @@ -29,10 +29,14 @@ categories = ["command-line-utilities", "science"] snap = "1" scroll = "0.12.0" num = "0.4.1" -ahash = "0.8.8" -serde = { version = "1.0.196", features = ["derive"] } +ahash = "0.8.9" +serde = { version = "1.0.197", features = ["derive"] } dashmap = "^5.5.3" bio-types = "1.0.1" smallvec = "1.13.1" noodles-bam = "0.56.0" noodles-sam = "0.53.0" +anyhow = "1.0.80" + +[dev-dependencies] +needletail="0.5.1" diff --git a/examples/read_chunk_bulk.rs b/examples/read_chunk_bulk.rs new file mode 100644 index 0000000..ea21b2f --- /dev/null +++ b/examples/read_chunk_bulk.rs @@ -0,0 +1,32 @@ +use anyhow; +use libradicl; +use libradicl::chunk::Chunk; +use libradicl::record::{PiscemBulkReadRecord, PiscemBulkRecordContext}; +use std::io::BufReader; + +fn main() -> anyhow::Result<()> { + let fname = std::env::args().nth(1).expect("input filename"); + let f = std::fs::File::open(&fname)?; + let mut ifile = BufReader::new(f); + let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?; + if let Ok(summary) = p.summary(None) { + println!("{}", summary); + } + + let _tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?; + + // Any extra context we may need to parse the records. In this case, it's the + // size of the barcode and the umi. + let tag_context = p.get_record_context::()?; + let first_chunk = Chunk::::from_bytes(&mut ifile, &tag_context); + println!( + "Chunk :: nbytes: {}, nrecs: {}", + first_chunk.nbytes, first_chunk.nrec + ); + assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len()); + for (i, r) in first_chunk.reads.iter().take(10).enumerate() { + println!("record {i}: {:?}", r); + } + + Ok(()) +} diff --git a/examples/read_chunk_single_cell.rs b/examples/read_chunk_single_cell.rs new file mode 100644 index 0000000..a0c3260 --- /dev/null +++ b/examples/read_chunk_single_cell.rs @@ -0,0 +1,32 @@ +use anyhow; +use libradicl::chunk::Chunk; +use libradicl::header; +use libradicl::record::{AlevinFryReadRecord, AlevinFryRecordContext}; +use std::io::BufReader; + +fn main() -> anyhow::Result<()> { + let fname = std::env::args().nth(1).expect("input filename"); + let f = std::fs::File::open(&fname)?; + let mut ifile = BufReader::new(f); + let p = header::RadPrelude::from_bytes(&mut ifile)?; + if let Ok(summary) = p.summary(None) { + println!("{}", summary); + } + + let tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?; + println!("tag map {:?}\n", tag_map); + // Any extra context we may need to parse the records. In this case, it's the + // size of the barcode and the umi. 
+ let tag_context = p.get_record_context::()?; + let first_chunk = Chunk::::from_bytes(&mut ifile, &tag_context); + println!( + "Chunk :: nbytes: {}, nrecs: {}", + first_chunk.nbytes, first_chunk.nrec + ); + assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len()); + for (i, r) in first_chunk.reads.iter().take(10).enumerate() { + println!("record {i}: {:?}", r); + } + + Ok(()) +} diff --git a/examples/read_header.rs b/examples/read_header.rs new file mode 100644 index 0000000..02ab669 --- /dev/null +++ b/examples/read_header.rs @@ -0,0 +1,48 @@ +use anyhow; +use libradicl::{self, rad_types}; +use std::io::BufReader; + +fn main() -> anyhow::Result<()> { + let fname = std::env::args().nth(1).expect("input filename"); + /* + { + let f = std::fs::File::open(&fname)?; + let mut ifile = BufReader::new(f); + let h = rad_types::RadHeader::from_bytes(&mut ifile)?; + + if let Ok(summary) = h.summary(None) { + println!("{}", summary); + } + + let ts = rad_types::TagSection::from_bytes(&mut ifile)?; + println!("File-level tags: {ts:?}"); + + let ts = rad_types::TagSection::from_bytes(&mut ifile)?; + println!("Read-level tags: {ts:?}"); + + let ts = rad_types::TagSection::from_bytes(&mut ifile)?; + println!("Alignment-level tags: {ts:?}"); + } + */ + + println!("\n"); + + { + let f = std::fs::File::open(&fname)?; + let mut ifile = BufReader::new(f); + let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?; + if let Ok(summary) = p.summary(None) { + println!("{}", summary); + } + let ftmp = p.file_tags.parse_tags_from_bytes(&mut ifile)?; + if let Some(rad_types::TagValue::ArrayU32(v)) = ftmp.get("ref_lengths") { + println!( + "file tag values: {:?}", + v.iter().take(10).collect::>() + ); + } else { + println!("file-level tags: {:?}", &ftmp); + } + } + Ok(()) +} diff --git a/src/chunk.rs b/src/chunk.rs new file mode 100644 index 0000000..2509ef2 --- /dev/null +++ b/src/chunk.rs @@ -0,0 +1,69 @@ +use crate::{self as libradicl}; +use libradicl::record::MappedRecord; +use scroll::Pread; +use std::io::{Cursor, Read}; + +#[derive(Debug)] +pub struct Chunk { + pub nbytes: u32, + pub nrec: u32, + pub reads: Vec, +} + +#[derive(Debug)] +#[allow(dead_code)] +pub struct CorrectedCbChunk { + pub(crate) remaining_records: u32, + pub(crate) corrected_bc: u64, + pub(crate) nrec: u32, + pub(crate) data: Cursor>, /*, + umis: Vec, + ref_offsets: Vec, + ref_ids: Vec, + */ +} + +pub struct ChunkConfig { + pub num_chunks: u64, + pub bc_type: u8, + pub umi_type: u8, +} + +impl Chunk { + /// Read the header of the next [Chunk] from the provided `reader`. This + /// function returns a tuple representing the number of bytes and number of + /// records, respectively, in the chunk. + #[inline] + pub fn read_header(reader: &mut T) -> (u32, u32) { + let mut buf = [0u8; 8]; + reader.read_exact(&mut buf).unwrap(); + let nbytes = buf.pread::(0).unwrap(); + let nrec = buf.pread::(4).unwrap(); + (nbytes, nrec) + } + + /// Read the next [Chunk] from the provided reader and return it. + #[inline] + pub fn from_bytes(reader: &mut T, ctx: &R::ParsingContext) -> Self { + let (nbytes, nrec) = Self::read_header(reader); + let mut c = Self { + nbytes, + nrec, + reads: Vec::::with_capacity(nrec as usize), + }; + + for _ in 0..(nrec as usize) { + c.reads.push(R::from_bytes_with_context(reader, ctx)); + } + + c + } + + /// Peeks to the first [libradicl::record::AlevinFryReadRecord] in the buffer `buf`, and returns + /// the barcode and umi associated with this record. 
It is assumed + /// that there is at least one [libradicl::record::AlevinFryReadRecord] present in the buffer. + #[inline] + pub fn peek_record(buf: &[u8], ctx: &R::ParsingContext) -> R::PeekResult { + R::peek_record(buf, ctx) + } +} diff --git a/src/constants.rs b/src/constants.rs new file mode 100644 index 0000000..fcae1b3 --- /dev/null +++ b/src/constants.rs @@ -0,0 +1 @@ +pub(crate) const MAX_REF_NAME_LEN: usize = 65536; diff --git a/src/header.rs b/src/header.rs new file mode 100644 index 0000000..e26e984 --- /dev/null +++ b/src/header.rs @@ -0,0 +1,176 @@ +use crate::{self as libradicl, constants}; +use libradicl::rad_types::{TagSection, TagSectionLabel}; +use libradicl::record::RecordContext; +use noodles_sam as sam; +use scroll::Pread; +use std::io::Read; + +/// The [RadPrelude] groups together the [RadHeader] +/// as well as the relevant top-level [TagSection]s of the file. +/// It constitutes everything in the initial file prior to the +/// start of the first [libradicl::chunk::Chunk]. +pub struct RadPrelude { + pub hdr: RadHeader, + pub file_tags: TagSection, + pub read_tags: TagSection, + pub aln_tags: TagSection, +} + +/// The [RadHeader] contains the relevant information about the +/// references against which the reads in this file were mapped and +/// information about the way in which mapping was performed. +pub struct RadHeader { + pub is_paired: u8, + pub ref_count: u64, + pub ref_names: Vec, + pub num_chunks: u64, +} + +impl Default for RadHeader { + fn default() -> Self { + Self::new() + } +} + +impl RadHeader { + /// Create a new empty [RadHeader] + pub fn new() -> Self { + Self { + is_paired: 0, + ref_count: 0, + ref_names: vec![], + num_chunks: 0, + } + } + + /// Create and return a new [RadHeader] by reading the contents of the + /// `reader`. If the reader is positioned such that a valid [RadHeader] comes + /// next, then this function returns [Ok(RadHeader)], otherwise, it returns + /// an [anyhow::Error] explaining the failure to parse the [RadHeader]. + pub fn from_bytes(reader: &mut T) -> anyhow::Result { + let mut rh = RadHeader::new(); + + // size of the longest allowable string. + let mut buf = [0u8; constants::MAX_REF_NAME_LEN]; + reader.read_exact(&mut buf[0..9])?; + rh.is_paired = buf.pread(0)?; + rh.ref_count = buf.pread::(1)?; + + // we know how many names we will read in. + rh.ref_names.reserve_exact(rh.ref_count as usize); + + let mut num_read = 0u64; + while num_read < rh.ref_count { + // the length of the string + reader.read_exact(&mut buf[0..2])?; + let l: usize = buf.pread::(0)? as usize; + // the string itself + reader.read_exact(&mut buf[0..l])?; + rh.ref_names + .push(std::str::from_utf8(&buf[0..l])?.to_string()); + num_read += 1; + } + + reader.read_exact(&mut buf[0..8])?; + rh.num_chunks = buf.pread::(0)?; + Ok(rh) + } + + /// Create and return a [RadHeader] from the provided BAM/SAM header + /// (represented by the noodles [sam::Header] `header`). + /// **Note**: The returned [RadHeader] will *not* have a value for the `num_chunks` + /// field, which will remain set at 0, nor will it set a meaningful value for the + /// `is_paried` flag, since the SAM/BAM header itself doesn't encode this information. + pub fn from_bam_header(header: &sam::Header) -> RadHeader { + let mut rh = RadHeader { + is_paired: 0, + ref_count: 0, + ref_names: vec![], + num_chunks: 0, + }; + + let ref_seqs = header.reference_sequences(); + rh.ref_count = ref_seqs.len() as u64; + // we know how many names we will read in. 
+ rh.ref_names.reserve_exact(rh.ref_count as usize); + for (k, _v) in ref_seqs.iter() { + rh.ref_names.push(k.to_string()); + } + rh + } + + /// Returns the size, in bytes, that this [RadHeader] will take + /// if written to an output stream. + pub fn get_size(&self) -> usize { + let mut tot_size = 0usize; + tot_size += std::mem::size_of_val(&self.is_paired) + std::mem::size_of_val(&self.ref_count); + // each name takes 2 bytes for the length, plus the actual + // number of bytes required by the string itself. + for t in self.ref_names.iter().map(|a| a.len()) { + tot_size += std::mem::size_of::() + t; + } + tot_size += std::mem::size_of_val(&self.num_chunks); + tot_size + } + + pub fn summary(&self, num_refs: Option) -> anyhow::Result { + use std::fmt::Write as _; + let mut s = String::new(); + writeln!(&mut s, "RadHeader {{")?; + writeln!(&mut s, "is_paired: {}", self.is_paired)?; + writeln!(&mut s, "ref_count: {}", self.ref_count)?; + + let refs_to_print = match num_refs { + Some(rcount) => rcount.min(self.ref_count as usize), + None => (self.ref_count as usize).min(10_usize), + }; + + for rn in self.ref_names.iter().take(refs_to_print) { + writeln!(&mut s, " ref: {}", rn)?; + } + writeln!(&mut s, " ...")?; + + writeln!(&mut s, "num_chunks: {}", self.num_chunks)?; + writeln!(&mut s, "}}")?; + Ok(s) + } +} + +impl RadPrelude { + pub fn from_bytes(reader: &mut T) -> anyhow::Result { + let hdr = RadHeader::from_bytes(reader)?; + let file_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::FileTags)?; + let read_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::ReadTags)?; + let aln_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::AlignmentTags)?; + + //let file_tag_vals = file_tags.parse_tags_from_bytes(reader)?; + //println!("file-level tag values: {:?}", file_tag_vals); + + Ok(Self { + hdr, + file_tags, + read_tags, + aln_tags, + }) + } + + pub fn summary(&self, num_refs: Option) -> anyhow::Result { + use std::fmt::Write as _; + let mut s = self.hdr.summary(num_refs)?; + writeln!(&mut s, "[[{:?}]]", self.file_tags)?; + writeln!(&mut s, "[[{:?}]]", self.read_tags)?; + writeln!(&mut s, "[[{:?}]]", self.aln_tags)?; + //writeln!(&mut s, "file-level tag values [{:?}]", self.file_tag_vals)?; + Ok(s) + } + + /// Obtain a [RecordContext] for a record of type `R` from this prelude, by + /// using the associated [TagSection]s. **Note**: Since this function + /// constructs the resulting `R` itself, and doesn't take any `R` parameter, + /// then it must always be invoked with the proper + /// [turbofish](https://doc.rust-lang.org/1.30.0/book/2018-edition/appendix-02-operators.html?highlight=turbofish#non-operator-symbols) + /// notation. 
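A minimal usage sketch of the turbofish call described above, condensing the bundled examples (here `ifile` is assumed to be a reader positioned at the start of a RAD file):

use libradicl::chunk::Chunk;
use libradicl::header::RadPrelude;
use libradicl::record::{AlevinFryReadRecord, AlevinFryRecordContext};

fn read_first_chunk<T: std::io::Read>(ifile: &mut T) -> anyhow::Result<()> {
    let prelude = RadPrelude::from_bytes(ifile)?;
    // consume the file-level tag values so the reader sits at the first chunk
    let _file_tag_vals = prelude.file_tags.parse_tags_from_bytes_checked(ifile)?;
    // the context type cannot be inferred from the arguments, so it is named explicitly
    let ctx = prelude.get_record_context::<AlevinFryRecordContext>()?;
    let chunk = Chunk::<AlevinFryReadRecord>::from_bytes(ifile, &ctx);
    println!("chunk holds {} records in {} bytes", chunk.nrec, chunk.nbytes);
    Ok(())
}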
+ pub fn get_record_context(&self) -> anyhow::Result { + R::get_context_from_tag_section(&self.file_tags, &self.read_tags, &self.aln_tags) + } +} diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 0000000..ca308f6 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,56 @@ +use crate::libradicl::rad_types::RadIntId; +use scroll::Pread; +use std::io::Write; +use std::io::{Cursor, Read}; + +pub fn read_into_u64(reader: &mut T, rt: &RadIntId) -> u64 { + let mut rbuf = [0u8; 8]; + + let v: u64 = match rt { + RadIntId::U8 => { + reader.read_exact(&mut rbuf[0..1]).unwrap(); + rbuf.pread::(0).unwrap() as u64 + } + RadIntId::U16 => { + reader.read_exact(&mut rbuf[0..2]).unwrap(); + rbuf.pread::(0).unwrap() as u64 + } + RadIntId::U32 => { + reader.read_exact(&mut rbuf[0..4]).unwrap(); + rbuf.pread::(0).unwrap() as u64 + } + RadIntId::U64 => { + reader.read_exact(&mut rbuf[0..8]).unwrap(); + rbuf.pread::(0).unwrap() + } + }; + v +} + +pub fn write_str_bin(v: &str, type_id: &RadIntId, owriter: &mut Cursor>) { + match type_id { + RadIntId::U8 => { + owriter + .write_all(&(v.len() as u8).to_le_bytes()) + .expect("coudn't write to output file"); + } + RadIntId::U16 => { + owriter + .write_all(&(v.len() as u16).to_le_bytes()) + .expect("coudn't write to output file"); + } + RadIntId::U32 => { + owriter + .write_all(&(v.len() as u32).to_le_bytes()) + .expect("coudn't write to output file"); + } + RadIntId::U64 => { + owriter + .write_all(&(v.len() as u64).to_le_bytes()) + .expect("coudn't write to output file"); + } + } + owriter + .write_all(v.as_bytes()) + .expect("coudn't write to output file"); +} diff --git a/src/lib.rs b/src/lib.rs index 1a417bb..0da7648 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,9 @@ use crate as libradicl; -use self::libradicl::rad_types::{CorrectedCbChunk, RadIntId, ReadRecord}; +use self::libradicl::chunk::CorrectedCbChunk; +use self::libradicl::rad_types::RadIntId; +use self::libradicl::record::AlevinFryReadRecord; use self::libradicl::schema::TempCellInfo; #[allow(unused_imports)] use ahash::{AHasher, RandomState}; @@ -28,10 +30,17 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::vec::Vec; +pub mod chunk; +pub mod constants; pub mod exit_codes; +pub mod header; +pub mod io; pub mod rad_types; +pub mod record; pub mod schema; pub mod utils; +#[macro_use] +mod macros; // Name of the program, to be used in diagnostic messages. static LIB_NAME: &str = "libradicl"; @@ -108,7 +117,7 @@ impl BarcodeLookupMap { end: self.offsets[(query_pref + 1) as usize], }; - let qs = qrange.start as usize; + let qs = qrange.start; // if we can, then we return the found barcode and that there was 1 best hit if let Ok(res) = self.barcodes[qrange].binary_search(&query) { @@ -143,7 +152,7 @@ impl BarcodeLookupMap { end: self.offsets[(query_pref + 1) as usize], }; - let qs = qrange.start as usize; + let qs = qrange.start; if try_exact { // first, we try to find exactly. @@ -164,7 +173,7 @@ impl BarcodeLookupMap { // that are 1 mismatch off in the suffix. 
if !(std::ops::Range::::is_empty(&qrange)) { // the initial offset of suffixes for this prefix - let qs = qrange.start as usize; + let qs = qrange.start; // for each position in the suffix for i in (0..suffix_bits).step_by(2) { @@ -205,7 +214,7 @@ impl BarcodeLookupMap { start: self.offsets[query_pref as usize], end: self.offsets[(query_pref + 1) as usize], }; - let qs = qrange.start as usize; + let qs = qrange.start; if let Ok(res) = self.barcodes[qrange].binary_search(&nquery) { ret = Some(qs + res); num_neighbors += 1; @@ -295,7 +304,7 @@ pub fn dump_chunk(v: &mut CorrectedCbChunk, owriter: &Mutex>) { owriter.lock().unwrap().write_all(v.data.get_ref()).unwrap(); } -/// Given a `BufReader` from which to read a set of records that +/// Given a [BufReader]`` from which to read a set of records that /// should reside in the same collated bucket, this function will /// collate the records by cell barcode, filling them into a chunk of /// memory exactly as they will reside on disk. If `compress` is true @@ -326,7 +335,7 @@ pub fn collate_temporary_bucket_twopass( // read the header of the record // we don't bother reading the whole thing here // because we will just copy later as need be - let tup = ReadRecord::from_bytes_record_header(reader, bct, umit); + let tup = AlevinFryReadRecord::from_bytes_record_header(reader, bct, umit); // get the entry for this chunk, or create a new one let v = cb_byte_map.entry(tup.0).or_insert(TempCellInfo { @@ -344,10 +353,10 @@ pub fn collate_temporary_bucket_twopass( reader.read_exact(&mut tbuf[0..(size_of_u32 * na)]).unwrap(); // compute the total number of bytes this record requires let nbytes = calc_record_bytes(na); - (*v).offset += nbytes as u64; - (*v).nbytes += nbytes as u32; - (*v).nrec += 1; - total_bytes += nbytes as usize; + v.offset += nbytes as u64; + v.nbytes += nbytes as u32; + v.nrec += 1; + total_bytes += nbytes; } // each cell will have a header (8 bytes each) @@ -363,14 +372,14 @@ pub fn collate_temporary_bucket_twopass( // jump to the position where this chunk should start // and write the header output_buffer.set_position(next_offset); - let cell_bytes = (*v).nbytes as u32; - let cell_rec = (*v).nrec as u32; + let cell_bytes = v.nbytes; + let cell_rec = v.nrec; output_buffer.write_all(&cell_bytes.to_le_bytes()).unwrap(); output_buffer.write_all(&cell_rec.to_le_bytes()).unwrap(); // where we will start writing records for this cell - (*v).offset = output_buffer.position(); + v.offset = output_buffer.position(); // the number of bytes allocated to this chunk - let nbytes = (*v).nbytes as u64; + let nbytes = v.nbytes as u64; // the next record will start after this one next_offset += nbytes; } @@ -387,7 +396,7 @@ pub fn collate_temporary_bucket_twopass( // read the header of the record // we don't bother reading the whole thing here // because we will just copy later as need be - let tup = ReadRecord::from_bytes_record_header(reader, bct, umit); + let tup = AlevinFryReadRecord::from_bytes_record_header(reader, bct, umit); // get the entry for this chunk, or create a new one if let Some(v) = cb_byte_map.get_mut(&tup.0) { @@ -403,15 +412,13 @@ pub fn collate_temporary_bucket_twopass( umit.write_to(tup.1, &mut output_buffer).unwrap(); // read the alignment records - reader - .read_exact(&mut tbuf[0..(size_of_u32 as usize * na)]) - .unwrap(); + reader.read_exact(&mut tbuf[0..(size_of_u32 * na)]).unwrap(); // write them output_buffer - .write_all(&tbuf[..(size_of_u32 as usize * na)]) + .write_all(&tbuf[..(size_of_u32 * na)]) .unwrap(); - 
(*v).offset = output_buffer.position(); + v.offset = output_buffer.position(); } else { panic!("should not have any barcodes we can't find"); } @@ -461,7 +468,7 @@ pub fn collate_temporary_bucket( // read the header of the record // we don't bother reading the whole thing here // because we will just copy later as need be - let tup = ReadRecord::from_bytes_record_header(reader, bct, umit); + let tup = AlevinFryReadRecord::from_bytes_record_header(reader, bct, umit); // get the entry for this chunk, or create a new one let v = output_cache @@ -469,17 +476,17 @@ pub fn collate_temporary_bucket( .or_insert_with(|| CorrectedCbChunk::from_label_and_counter(tup.0, est_num_rec)); // keep track of the number of records we're writing - (*v).nrec += 1; + v.nrec += 1; // write the num align let na = tup.2; - (*v).data.write_all(&na.to_le_bytes()).unwrap(); + v.data.write_all(&na.to_le_bytes()).unwrap(); // write the corrected barcode - bct.write_to(tup.0, &mut (*v).data).unwrap(); - umit.write_to(tup.1, &mut (*v).data).unwrap(); + bct.write_to(tup.0, &mut v.data).unwrap(); + umit.write_to(tup.1, &mut v.data).unwrap(); // read the alignment records reader.read_exact(&mut tbuf[0..(4 * na as usize)]).unwrap(); // write them - (*v).data.write_all(&tbuf[..(4 * na as usize)]).unwrap(); + v.data.write_all(&tbuf[..(4 * na as usize)]).unwrap(); } } @@ -502,11 +509,10 @@ pub fn process_corrected_cb_chunk( let nrec = buf.pread::(4).unwrap(); // for each record, read it for _ in 0..(nrec as usize) { - let tup = ReadRecord::from_bytes_record_header(reader, bct, umit); - //let rr = ReadRecord::from_bytes_keep_ori(reader, &bct, &umit, expected_ori); + let tup = AlevinFryReadRecord::from_bytes_record_header(reader, bct, umit); // if this record had a correct or correctable barcode if let Some(corrected_id) = correct_map.get(&tup.0) { - let rr = ReadRecord::from_bytes_with_header_keep_ori( + let rr = AlevinFryReadRecord::from_bytes_with_header_keep_ori( reader, tup.0, tup.1, @@ -562,7 +568,7 @@ impl TempBucket { bucket_id, bucket_writer: Arc::new(Mutex::new(BufWriter::with_capacity( 4096_usize, - File::create(parent.join(&format!("bucket_{}.tmp", bucket_id))).unwrap(), + File::create(parent.join(format!("bucket_{}.tmp", bucket_id))).unwrap(), ))), num_chunks: 0u32, num_records: 0u32, @@ -606,11 +612,11 @@ pub fn dump_corrected_cb_chunk_to_temp_file( // for each record, read it for _ in 0..(nrec as usize) { - let tup = ReadRecord::from_bytes_record_header(reader, bct, umit); + let tup = AlevinFryReadRecord::from_bytes_record_header(reader, bct, umit); // if this record had a correct or correctable barcode if let Some(corrected_id) = correct_map.get(&tup.0) { - let rr = ReadRecord::from_bytes_with_header_keep_ori( + let rr = AlevinFryReadRecord::from_bytes_with_header_keep_ori( reader, tup.0, tup.1, @@ -641,9 +647,7 @@ pub fn dump_corrected_cb_chunk_to_temp_file( // then first flush the buffer to file. 
if len + nb as usize >= flush_limit { let mut filebuf = v.bucket_writer.lock().unwrap(); - filebuf - .write_all(&bcursor.get_ref()[0..len as usize]) - .unwrap(); + filebuf.write_all(&bcursor.get_ref()[0..len]).unwrap(); // and reset the local buffer cursor bcursor.set_position(0); } @@ -684,17 +688,13 @@ pub fn dump_corrected_cb_chunk_to_temp_file( } pub fn as_u8_slice(v: &[u32]) -> &[u8] { - unsafe { - std::slice::from_raw_parts( - v.as_ptr() as *const u8, - v.len() * std::mem::size_of::(), - ) - } + unsafe { std::slice::from_raw_parts(v.as_ptr() as *const u8, std::mem::size_of_val(v)) } } #[cfg(test)] mod tests { use crate::BarcodeLookupMap; + use needletail; #[test] fn test_barcode_lookup_map() { diff --git a/src/macros.rs b/src/macros.rs new file mode 100644 index 0000000..0e97b53 --- /dev/null +++ b/src/macros.rs @@ -0,0 +1,12 @@ +// macro generalizing solution from +// https://stackoverflow.com/questions/77388769/convert-vecu8-to-vecfloat-in-rust +#[macro_export] +macro_rules! u8_to_vec_of { + ($a:expr, $b:ty) => { + $a.chunks_exact(std::mem::size_of::<$b>()) + .map(TryInto::try_into) + .map(Result::unwrap) + .map(<$b>::from_le_bytes) + .collect() + }; +} diff --git a/src/rad_types.rs b/src/rad_types.rs index ee73093..48a974a 100644 --- a/src/rad_types.rs +++ b/src/rad_types.rs @@ -7,69 +7,47 @@ * License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause */ -use crate as libradicl; - -use self::libradicl::utils; -use bio_types::strand::*; +use crate::{self as libradicl, constants}; +use anyhow::{self, bail}; +use libradicl::u8_to_vec_of; use num::cast::AsPrimitive; -use noodles_sam as sam; use scroll::Pread; +use std::io::Read; use std::io::Write; -use std::io::{Cursor, Read}; use std::mem; -pub struct RadHeader { - pub is_paired: u8, - pub ref_count: u64, - pub ref_names: Vec, - pub num_chunks: u64, -} - +#[derive(Clone, Debug)] pub struct TagDesc { pub name: String, - pub typeid: u8, + pub typeid: RadType, } +#[derive(Clone, Debug)] +pub enum TagSectionLabel { + FileTags, + ReadTags, + AlignmentTags, + Unlabeled, +} + +/// A [TagSection] consists of a series of [TagDesc]s that are +/// logically grouped together as tags for a specific unit +#[derive(Clone, Debug)] pub struct TagSection { + pub label: TagSectionLabel, pub tags: Vec, } -// The below are currently hard-coded -// until we decide how to solve this -// generally +/// The below are currently hard-coded +/// until we decide how to solve this +/// generally #[derive(Debug)] pub struct FileTags { pub bclen: u16, pub umilen: u16, } -#[derive(Debug)] -pub struct ReadRecord { - pub bc: u64, - pub umi: u64, - pub dirs: Vec, - pub refs: Vec, -} -#[derive(Debug)] -pub struct Chunk { - pub nbytes: u32, - pub nrec: u32, - pub reads: Vec, -} -#[derive(Debug)] -#[allow(dead_code)] -pub struct CorrectedCbChunk { - pub(crate) remaining_records: u32, - pub(crate) corrected_bc: u64, - pub(crate) nrec: u32, - pub(crate) data: Cursor>, /*, - umis: Vec, - ref_offsets: Vec, - ref_ids: Vec, - */ -} - -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum RadIntId { U8, U16, @@ -77,6 +55,56 @@ pub enum RadIntId { U64, } +impl RadIntId { + #[inline] + pub fn size_of(&self) -> usize { + match self { + Self::U8 => mem::size_of::(), + Self::U16 => mem::size_of::(), + Self::U32 => mem::size_of::(), + Self::U64 => mem::size_of::(), + } + } +} + +impl From for RadIntId { + fn from(x: u8) -> Self { + match x { + 1 => Self::U8, + 2 => Self::U16, + 3 => Self::U32, + 4 => Self::U64, + _ => panic!("Should not happen"), 
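// A brief illustration (a sketch, not part of this patch) of the `u8_to_vec_of!` macro
// defined in src/macros.rs above: it decodes a little-endian byte buffer into a vector
// of a fixed-width type, and `TagDesc::value_from_bytes` below uses it for array tags.
//
//     let bytes: Vec<u8> = vec![1, 0, 2, 0, 3, 0];
//     let vals: Vec<u16> = u8_to_vec_of!(bytes, u16); // yields vec![1, 2, 3]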
+ } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum RadFloatId { + F32, + F64, +} + +impl RadFloatId { + #[inline] + pub fn size_of(&self) -> usize { + match self { + Self::F32 => mem::size_of::(), + Self::F64 => mem::size_of::(), + } + } +} + +impl From for RadFloatId { + fn from(x: u8) -> Self { + match x { + 5 => Self::F32, + 6 => Self::F64, + _ => panic!("Should not happen"), + } + } +} + pub trait PrimitiveInteger: AsPrimitive + AsPrimitive @@ -107,13 +135,9 @@ impl< } impl RadIntId { + #[inline] pub fn bytes_for_type(&self) -> usize { - match self { - Self::U8 => std::mem::size_of::(), - Self::U16 => std::mem::size_of::(), - Self::U32 => std::mem::size_of::(), - Self::U64 => std::mem::size_of::(), - } + self.size_of() } /// Based on the variant of the current enum, write the value `v` @@ -146,35 +170,80 @@ impl RadIntId { } } } + + pub fn read_into_usize(&self, buf: &[u8]) -> usize { + match self { + Self::U8 => buf.pread::(0).unwrap() as usize, + Self::U16 => buf.pread::(0).unwrap() as usize, + Self::U32 => buf.pread::(0).unwrap() as usize, + Self::U64 => buf.pread::(0).unwrap() as usize, + } + } } -pub struct ChunkConfig { - pub num_chunks: u64, - pub bc_type: u8, - pub umi_type: u8, +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum RadAtomicId { + Int(RadIntId), + Float(RadFloatId), + String, +} + +impl RadAtomicId { + #[inline] + pub fn size_of(&self) -> usize { + match self { + Self::Int(x) => x.size_of(), + Self::Float(x) => x.size_of(), + Self::String => panic!("RadAtomicId::String does not have a fixed type"), + } + } } -#[derive(Copy, Clone)] +impl From for RadAtomicId { + fn from(x: u8) -> Self { + match x { + 1 => Self::Int(RadIntId::U8), + 2 => Self::Int(RadIntId::U16), + 3 => Self::Int(RadIntId::U32), + 4 => Self::Int(RadIntId::U64), + 5 => Self::Float(RadFloatId::F32), + 6 => Self::Float(RadFloatId::F64), + 8 => Self::String, + x => panic!("Should not happen, num is {x}"), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum RadType { Bool, - U8, - U16, - U32, - U64, - F32, - F64, + Int(RadIntId), + Float(RadFloatId), + // holds length type and value type, but not length + // and data themselves + Array(RadIntId, RadAtomicId), + // does not hold length, just a marker for the type + String, +} + +impl RadType { + #[inline] + pub fn is_int_type(&self) -> bool { + matches!(self, Self::Int(_)) + } } pub fn encode_type_tag(type_tag: RadType) -> Option { match type_tag { RadType::Bool => Some(0), - RadType::U8 => Some(1), - RadType::U16 => Some(2), - RadType::U32 => Some(3), - RadType::U64 => Some(4), - RadType::F32 => Some(5), - RadType::F64 => Some(6), - //_ => None, + RadType::Int(RadIntId::U8) => Some(1), + RadType::Int(RadIntId::U16) => Some(2), + RadType::Int(RadIntId::U32) => Some(3), + RadType::Int(RadIntId::U64) => Some(4), + RadType::Float(RadFloatId::F32) => Some(5), + RadType::Float(RadFloatId::F64) => Some(6), + RadType::Array(_, _) => Some(7), + RadType::String => Some(8), //_ => None, } } @@ -188,210 +257,118 @@ pub fn decode_int_type_tag(type_id: u8) -> Option { } } -fn read_into_u64(reader: &mut T, rt: &RadIntId) -> u64 { - let mut rbuf = [0u8; 8]; - - let v: u64 = match rt { - RadIntId::U8 => { - reader.read_exact(&mut rbuf[0..1]).unwrap(); - rbuf.pread::(0).unwrap() as u64 - } - RadIntId::U16 => { - reader.read_exact(&mut rbuf[0..2]).unwrap(); - rbuf.pread::(0).unwrap() as u64 - } - RadIntId::U32 => { - reader.read_exact(&mut rbuf[0..4]).unwrap(); - rbuf.pread::(0).unwrap() as u64 - } - RadIntId::U64 => { - 
reader.read_exact(&mut rbuf[0..8]).unwrap(); - rbuf.pread::(0).unwrap() - } - }; - v +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum MappingType { + Unmapped, + SingleMapped, + MappedFirstOrphan, + MappedSecondOrphan, + MappedPair, } -impl ReadRecord { - pub fn is_empty(&self) -> bool { - self.refs.is_empty() - } - - pub fn from_bytes(reader: &mut T, bct: &RadIntId, umit: &RadIntId) -> Self { - let mut rbuf = [0u8; 255]; - - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let na = rbuf.pread::(0).unwrap(); - let bc = read_into_u64(reader, bct); - let umi = read_into_u64(reader, umit); - - let mut rec = Self { - bc, - umi, - dirs: Vec::with_capacity(na as usize), - refs: Vec::with_capacity(na as usize), - }; - - //println!("number of records : {:?}",na); - - for _ in 0..(na as usize) { - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let v = rbuf.pread::(0).unwrap(); - let dir = (v & utils::MASK_LOWER_31_U32) != 0; - rec.dirs.push(dir); - rec.refs.push(v & utils::MASK_TOP_BIT_U32); +impl MappingType { + pub fn from_u8(t: u8) -> Self { + match t { + 0 => MappingType::Unmapped, + 1 => MappingType::SingleMapped, + 2 => MappingType::MappedFirstOrphan, + 3 => MappingType::MappedSecondOrphan, + 4 => MappingType::MappedPair, + _ => MappingType::Unmapped, } - - rec } - pub fn from_bytes_record_header( - reader: &mut T, - bct: &RadIntId, - umit: &RadIntId, - ) -> (u64, u64, u32) { - let mut rbuf = [0u8; 4]; - reader.read_exact(&mut rbuf).unwrap(); - let na = u32::from_le_bytes(rbuf); //.pread::(0).unwrap(); - let bc = read_into_u64(reader, bct); - let umi = read_into_u64(reader, umit); - (bc, umi, na) - } - - pub fn from_bytes_with_header_keep_ori( - reader: &mut T, - bc: u64, - umi: u64, - na: u32, - expected_ori: &Strand, - ) -> Self { - let mut rbuf = [0u8; 255]; - let mut rec = Self { - bc, - umi, - dirs: Vec::with_capacity(na as usize), - refs: Vec::with_capacity(na as usize), - }; - - for _ in 0..(na as usize) { - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let v = rbuf.pread::(0).unwrap(); - - // fw if the leftmost bit is 1, otherwise rc - let strand = if (v & utils::MASK_LOWER_31_U32) > 0 { - Strand::Forward - } else { - Strand::Reverse - }; - - if expected_ori.same(&strand) || expected_ori.is_unknown() { - rec.refs.push(v & utils::MASK_TOP_BIT_U32); - } + #[inline] + pub fn get_mask(&self) -> u32 { + match &self { + // if unmapped, we ignore flags all together. + MappingType::Unmapped => 0b00, + // if paired we care about both read and mate. + MappingType::MappedPair => 0b11, + // if orphan or single, we care only about read + // mate flag should be ignored. + _ => 0b10, } - - // make sure these are sorted in this step. 
- rec.refs.sort_unstable(); - rec } - pub fn from_bytes_keep_ori( - reader: &mut T, - bct: &RadIntId, - umit: &RadIntId, - expected_ori: &Strand, - ) -> Self { - let mut rbuf = [0u8; 255]; - - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let na = rbuf.pread::(0).unwrap(); - - let bc = read_into_u64(reader, bct); - let umi = read_into_u64(reader, umit); - - let mut rec = Self { - bc, - umi, - dirs: Vec::with_capacity(na as usize), - refs: Vec::with_capacity(na as usize), - }; + #[inline] + pub fn is_orphan(&self) -> bool { + matches!( + &self, + MappingType::MappedFirstOrphan | MappingType::MappedSecondOrphan + ) + } +} - for _ in 0..(na as usize) { - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let v = rbuf.pread::(0).unwrap(); +#[derive(Debug, Copy, Clone)] +pub enum MappedFragmentOrientation { + Reverse, + Forward, + ReverseReverse, + ReverseForward, + ForwardReverse, + ForwardForward, + Unknown, +} - // fw if the leftmost bit is 1, otherwise rc - let strand = if (v & utils::MASK_LOWER_31_U32) > 0 { - Strand::Forward +impl MappedFragmentOrientation { + pub fn from_u32_paired_status(n: u32, m: MappingType) -> Self { + // if not paired, then we don't care about + // the lowest order bit so shift it off + if matches!( + m, + MappingType::SingleMapped + | MappingType::MappedFirstOrphan + | MappingType::MappedSecondOrphan + ) { + if (n & 0b10) == 2 { + MappedFragmentOrientation::Forward } else { - Strand::Reverse - }; - - if expected_ori.same(&strand) || expected_ori.is_unknown() { - rec.refs.push(v & utils::MASK_TOP_BIT_U32); + MappedFragmentOrientation::Reverse + } + } else { + match n { + 0 => MappedFragmentOrientation::ReverseReverse, + 1 => MappedFragmentOrientation::ReverseForward, + 2 => MappedFragmentOrientation::ForwardReverse, + 3 => MappedFragmentOrientation::ForwardForward, + _ => MappedFragmentOrientation::Unknown, } } - - // make sure these are sorted in this step. - rec.refs.sort_unstable(); - rec } } -impl Chunk { - pub fn read_header(reader: &mut T) -> (u32, u32) { - let mut buf = [0u8; 8]; - - reader.read_exact(&mut buf).unwrap(); - let nbytes = buf.pread::(0).unwrap(); - let nrec = buf.pread::(4).unwrap(); - (nbytes, nrec) - } - - pub fn from_bytes(reader: &mut T, bct: &RadIntId, umit: &RadIntId) -> Self { - let mut buf = [0u8; 8]; - - reader.read_exact(&mut buf).unwrap(); - let nbytes = buf.pread::(0).unwrap(); - let nrec = buf.pread::(4).unwrap(); - let mut c = Self { - nbytes, - nrec, - reads: Vec::with_capacity(nrec as usize), - }; - - for _ in 0..(nrec as usize) { - c.reads.push(ReadRecord::from_bytes(reader, bct, umit)); +impl From for u32 { + fn from(item: MappedFragmentOrientation) -> Self { + match item { + MappedFragmentOrientation::ForwardReverse => 0b011, + MappedFragmentOrientation::ForwardForward => 0b101, + MappedFragmentOrientation::ReverseReverse => 0b110, + MappedFragmentOrientation::ReverseForward => 0b100, + MappedFragmentOrientation::Forward => 0b1, + MappedFragmentOrientation::Reverse => 0b10, + MappedFragmentOrientation::Unknown => 0b0, } - - c } +} - /// peeks to the first record in the buffer `buf`, and returns - /// the barcode and umi associated with this record. It is assumed - /// that there is at least one record present in the buffer. 
- pub fn peek_record(buf: &[u8], bct: &RadIntId, umit: &RadIntId) -> (u64, u64) { - let na_size = mem::size_of::(); - let bc_size = bct.bytes_for_type(); - - let _na = buf.pread::(0).unwrap(); - - let bc = match bct { - RadIntId::U8 => buf.pread::(na_size).unwrap() as u64, - RadIntId::U16 => buf.pread::(na_size).unwrap() as u64, - RadIntId::U32 => buf.pread::(na_size).unwrap() as u64, - RadIntId::U64 => buf.pread::(na_size).unwrap(), - }; - let umi = match umit { - RadIntId::U8 => buf.pread::(na_size + bc_size).unwrap() as u64, - RadIntId::U16 => buf.pread::(na_size + bc_size).unwrap() as u64, - RadIntId::U32 => buf.pread::(na_size + bc_size).unwrap() as u64, - RadIntId::U64 => buf.pread::(na_size + bc_size).unwrap(), - }; - (bc, umi) +impl From for MappedFragmentOrientation { + fn from(item: u32) -> Self { + match item { + 0b011 => MappedFragmentOrientation::ForwardReverse, + 0b101 => MappedFragmentOrientation::ForwardForward, + 0b110 => MappedFragmentOrientation::ReverseReverse, + 0b100 => MappedFragmentOrientation::ReverseForward, + 0b1 => MappedFragmentOrientation::Forward, + 0b10 => MappedFragmentOrientation::Reverse, + _ => MappedFragmentOrientation::Unknown, + } } } impl FileTags { + /// Reads the FileTags from the provided `reader` and return the + /// barcode length and umi length pub fn from_bytes(reader: &mut T) -> Self { let mut buf = [0u8; 4]; reader.read_exact(&mut buf).unwrap(); @@ -403,128 +380,476 @@ impl FileTags { } } +/// Convert from a u8 to the corresponding RadType. +/// This will only work for *non-aggregate* types +/// (i.e. it will not work for the array type, since +/// the type requiers more than a u8 worth of information). +impl From for RadType { + fn from(x: u8) -> Self { + match x { + 0 => RadType::Bool, + 1 => RadType::Int(RadIntId::U8), + 2 => RadType::Int(RadIntId::U16), + 3 => RadType::Int(RadIntId::U32), + 4 => RadType::Int(RadIntId::U64), + 5 => RadType::Float(RadFloatId::F32), + 6 => RadType::Float(RadFloatId::F64), + 7 => panic!("Should not happen"), + 8 => RadType::String, + _ => panic!("Should not happen"), + } + } +} + +/// This enum holds the different values that can be represented +/// in a RAD tag. For the aggregate (i.e. array) types, the type +/// of the element being stored in the array is encoded in the +/// [TagValue] variant, but the length of the array is not. +#[derive(Debug, PartialEq)] +pub enum TagValue { + Bool(bool), + U8(u8), + U16(u16), + U32(u32), + U64(u64), + F32(f32), + F64(f64), + ArrayU8(Vec), + ArrayU16(Vec), + ArrayU32(Vec), + ArrayU64(Vec), + ArrayF32(Vec), + ArrayF64(Vec), + ArrayString(Vec), + String(String), +} + impl TagDesc { - pub fn from_bytes(reader: &mut T) -> TagDesc { + /// Attempts to read a [TagDesc] from the provided `reader`. If the + /// `reader` is positioned at the start of a valid [TagDesc], then this + /// [TagDesc] is returned. Otherwise, a description of the error is returned + /// via an [anyhow::Error]. + pub fn from_bytes(reader: &mut T) -> anyhow::Result { // space for the string length (1 byte) // the longest string possible (255 char) // and the typeid - let mut buf = [0u8; 257]; - reader.read_exact(&mut buf[0..2]).unwrap(); - let str_len = buf.pread::(0).unwrap() as usize; + let mut buf = [0u8; constants::MAX_REF_NAME_LEN]; + reader.read_exact(&mut buf[0..2])?; + let str_len = buf.pread::(0)? 
as usize; // read str_len + 1 to get the type id that follows the string - reader.read_exact(&mut buf[0..str_len + 1]).unwrap(); - TagDesc { - name: std::str::from_utf8(&buf[0..str_len]).unwrap().to_string(), - typeid: buf.pread(str_len).unwrap(), + reader.read_exact(&mut buf[0..str_len + 1])?; + let name = std::str::from_utf8(&buf[0..str_len])?.to_string(); + let typeid = buf.pread(str_len)?; + // if the type id is 7, need to read the types of + // the length and element type, otherwise just turn the + // id into a proper RatType and we're done. + let rad_t = match typeid { + 0..=6 | 8 => typeid.into(), + 7 => { + reader.read_exact(&mut buf[0..2])?; + let t1: RadIntId = buf.pread::(0)?.into(); + let t2: RadAtomicId = buf.pread::(1)?.into(); + RadType::Array(t1, t2) + } + _ => { + bail!("{typeid} is an unrecognized RAD type id") + } + }; + + Ok(TagDesc { + name, + typeid: rad_t, + }) + } + + /// reads a [TagValue] from `reader` based on the type encoded in the + /// current [TagDesc], and returns this [TagValue]. This function + /// will `panic` if it cannot succesfully read the [TagValue] + /// from `reader`. + pub fn value_from_bytes(&self, reader: &mut T) -> TagValue { + let mut small_buf = [0u8; 8]; + match self.typeid { + RadType::Bool => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + TagValue::Bool(small_buf[0] > 1) + } + RadType::Int(RadIntId::U8) => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + TagValue::U8(small_buf[0]) + } + RadType::Int(RadIntId::U16) => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + TagValue::U16(small_buf.pread::(0).unwrap()) + } + RadType::Int(RadIntId::U32) => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + TagValue::U32(small_buf.pread::(0).unwrap()) + } + RadType::Int(RadIntId::U64) => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + TagValue::U64(small_buf.pread::(0).unwrap()) + } + RadType::Float(RadFloatId::F32) => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + TagValue::F32(small_buf.pread::(0).unwrap()) + } + RadType::Float(RadFloatId::F64) => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + TagValue::F64(small_buf.pread::(0).unwrap()) + } + RadType::Array(len_t, val_t) => { + let _ = reader.read_exact(&mut small_buf[0..len_t.size_of()]); + let vec_len = len_t.read_into_usize(&small_buf); + if val_t == RadAtomicId::String { + let mut strings = Vec::with_capacity(vec_len); + let sl: u16 = 0; + let mut buf = [0u8; 65536]; + for _ in 0..vec_len { + let _ = reader.read_exact(&mut sl.to_ne_bytes()); + let l = sl as usize; + let _ = reader.read_exact(&mut buf[0..l]); + unsafe { + strings.push(std::str::from_utf8_unchecked(&buf[0..l]).to_string()); + } + } + TagValue::ArrayString(strings) + } else { + let num_bytes = val_t.size_of() * vec_len; + let mut data = vec![0u8; num_bytes]; + let _ = reader.read_exact(data.as_mut_slice()); + match val_t { + RadAtomicId::Int(RadIntId::U8) => TagValue::ArrayU8(data), + RadAtomicId::Int(RadIntId::U16) => { + TagValue::ArrayU16(u8_to_vec_of!(data, u16)) + } + RadAtomicId::Int(RadIntId::U32) => { + TagValue::ArrayU32(u8_to_vec_of!(data, u32)) + } + RadAtomicId::Int(RadIntId::U64) => { + TagValue::ArrayU64(u8_to_vec_of!(data, u64)) + } + RadAtomicId::Float(RadFloatId::F32) => { + TagValue::ArrayF32(u8_to_vec_of!(data, f32)) + } + RadAtomicId::Float(RadFloatId::F64) => { + TagValue::ArrayF64(u8_to_vec_of!(data, f64)) + } + 
RadAtomicId::String => { + unimplemented!("match of RadAtomicId should not occur in this branch") + } + } + } + } + RadType::String => { + let _ = reader.read_exact(&mut small_buf[0..std::mem::size_of::()]); + let slen = small_buf.pread::(0).unwrap(); + let mut dat: Vec = vec![0_u8; slen as usize]; + let _ = reader.read_exact(dat.as_mut_slice()); + let s = unsafe { String::from_utf8_unchecked(dat) }; + TagValue::String(s) + } } } -} -impl TagSection { - pub fn from_bytes(reader: &mut T) -> TagSection { - let mut buf = [0u8; 2]; - reader.read_exact(&mut buf).unwrap(); - let num_tags = buf.pread::(0).unwrap() as usize; + /// Returns `true` if the [RadType] component of the current [TagDesc] matches + /// the type of the provided [TagValue] of `o` and `false` otherwise. + #[inline] + pub fn matches_value_type(&self, o: &TagValue) -> bool { + match (&self.typeid, o) { + (RadType::Bool, TagValue::Bool(_)) => true, + (RadType::Int(RadIntId::U8), TagValue::U8(_)) => true, + (RadType::Int(RadIntId::U16), TagValue::U16(_)) => true, + (RadType::Int(RadIntId::U32), TagValue::U32(_)) => true, + (RadType::Int(RadIntId::U64), TagValue::U64(_)) => true, + (RadType::Float(RadFloatId::F32), TagValue::F32(_)) => true, + (RadType::Float(RadFloatId::F64), TagValue::F64(_)) => true, + (RadType::Array(_, RadAtomicId::Int(RadIntId::U8)), TagValue::ArrayU8(_)) => true, + (RadType::Array(_, RadAtomicId::Int(RadIntId::U16)), TagValue::ArrayU16(_)) => true, + (RadType::Array(_, RadAtomicId::Int(RadIntId::U32)), TagValue::ArrayU32(_)) => true, + (RadType::Array(_, RadAtomicId::Int(RadIntId::U64)), TagValue::ArrayU64(_)) => true, + (RadType::Array(_, RadAtomicId::Float(RadFloatId::F32)), TagValue::ArrayF32(_)) => true, + (RadType::Array(_, RadAtomicId::Float(RadFloatId::F64)), TagValue::ArrayF64(_)) => true, + (RadType::Array(_, RadAtomicId::String), TagValue::ArrayString(_)) => true, + (RadType::String, TagValue::String(_)) => true, + (_, _) => false, + } + } +} - let mut ts = TagSection { - tags: Vec::with_capacity(num_tags), - }; +/// This type represents a mapping from [TagDesc]s to a corresponding set of +/// values conforming to these descriptions (i.e. in terms of types). The +/// [TagMap] allows you to fetch the value for a specific tag by name or index, or +/// to add values to a corresponding set of descriptions. +#[derive(Debug)] +pub struct TagMap<'a> { + keys: &'a [TagDesc], + dat: Vec, +} - for _ in 0..num_tags { - ts.tags.push(TagDesc::from_bytes(reader)); +impl<'a> TagMap<'a> { + /// Create a new TagMap whose set of keys is determined by + /// the provided `keyset`. This will have one value slot for + /// each provided key. + pub fn with_keyset(keyset: &'a [TagDesc]) -> Self { + Self { + keys: keyset, + dat: Vec::with_capacity(keyset.len()), } + } - ts + /// Try to add the next tag value. If there is space and the type + /// matches, add it and return `true`, otherwise return `false`. + pub fn add_checked(&mut self, val: TagValue) -> bool { + let next_idx = self.dat.len(); + if next_idx >= self.keys.len() || !self.keys[next_idx].matches_value_type(&val) { + false + } else { + self.dat.push(val); + true + } } -} -impl RadHeader { - pub fn from_bytes(reader: &mut T) -> RadHeader { - let mut rh = RadHeader { - is_paired: 0, - ref_count: 0, - ref_names: vec![], - num_chunks: 0, - }; + /// add the next TagValue to the data for this TagMap. + /// This function doesn't check if the type is correct or + /// if too many tag values have been added. 
It should + /// only be used when one is certain that the next tag value + /// appropriately matches the next available key. + pub fn add(&mut self, val: TagValue) { + self.dat.push(val); + } - // size of the longest allowable string. - let mut buf = [0u8; 65536]; - reader.read_exact(&mut buf[0..9]).unwrap(); - rh.is_paired = buf.pread(0).unwrap(); - rh.ref_count = buf.pread::(1).unwrap(); - - // we know how many names we will read in. - rh.ref_names.reserve_exact(rh.ref_count as usize); - - let mut num_read = 0u64; - while num_read < rh.ref_count { - reader.read_exact(&mut buf[0..2]).unwrap(); - let l: usize = buf.pread::(0).unwrap() as usize; - reader.read_exact(&mut buf[0..l]).unwrap(); - rh.ref_names - .push(std::str::from_utf8(&buf[0..l]).unwrap().to_string()); - num_read += 1; + /// get the value for the tag associated with the name `key`, returns + /// Some(&TagValue) for the appropriate tag if it exists, and None otherwise. + pub fn get(&self, key: &str) -> Option<&TagValue> { + for (k, val) in self.keys.iter().zip(self.dat.iter()) { + if k.name == key { + return Some(val); + } } + None + } + + /// get the value for the tag at index `idx` returns Some(&TagValue) if `idx` + /// is in bounds and None otherwise. + pub fn get_at_index(&self, idx: usize) -> Option<&TagValue> { + self.dat.get(idx) + } +} - reader.read_exact(&mut buf[0..8]).unwrap(); - rh.num_chunks = buf.pread::(0).unwrap(); - rh +impl<'a> std::ops::Index for TagMap<'a> { + type Output = TagValue; + /// Returns a reference to the [TagValue] in the [TagMap] at the + /// provided `index`, panics if `index` is out of bounds. + #[inline] + fn index(&self, index: usize) -> &Self::Output { + &self.dat[index] } +} + +impl TagSection { + /// Attempts to read a [TagSection] from the provided `reader`. If the + /// `reader` is positioned at the start of a valid [TagSection], then this + /// [TagSection] is returned. Otherwise, a description of the error is returned + /// via an [anyhow::Error]. + pub fn from_bytes(reader: &mut T) -> anyhow::Result { + Self::from_bytes_with_label(reader, TagSectionLabel::Unlabeled) + } + + /// Attempts to read a [TagSection] from the provided `reader`. If the + /// `reader` is positioned at the start of a valid [TagSection], then this + /// [TagSection] is returned. Otherwise, a description of the error is returned + /// via an [anyhow::Error]. The returned [TagSection] will be labeled with the + /// provided `label`. + pub fn from_bytes_with_label( + reader: &mut T, + label: TagSectionLabel, + ) -> anyhow::Result { + let mut buf = [0u8; 2]; + reader.read_exact(&mut buf)?; + let num_tags = buf.pread::(0)? as usize; - pub fn from_bam_header(header: &sam::Header) -> RadHeader { - let mut rh = RadHeader { - is_paired: 0, - ref_count: 0, - ref_names: vec![], - num_chunks: 0, + let mut ts = Self { + label, + tags: Vec::with_capacity(num_tags), }; - let ref_seqs = header.reference_sequences(); - rh.ref_count = ref_seqs.len() as u64; - // we know how many names we will read in. 
- rh.ref_names.reserve_exact(rh.ref_count as usize); - for (k, _v) in ref_seqs.iter() { - rh.ref_names.push(k.to_string()); + for _ in 0..num_tags { + ts.tags.push(TagDesc::from_bytes(reader)?); } - rh + Ok(ts) } - pub fn get_size(&self) -> usize { - let mut tot_size = 0usize; - tot_size += std::mem::size_of::() + std::mem::size_of::(); - for (_i, t) in self.ref_names.iter().map(|a| a.len()).enumerate() { - tot_size += t; + pub fn parse_tags_from_bytes(&self, reader: &mut T) -> anyhow::Result { + // loop over all of the tag descriptions in this section, and parse a + // tag value for each. + //let mut tv = Vec::::new(); + let mut tm = TagMap::with_keyset(&self.tags); + for tag_desc in &self.tags { + tm.add(tag_desc.value_from_bytes(reader)); } - tot_size += std::mem::size_of::(); - tot_size + Ok(tm) } -} -pub fn write_str_bin(v: &str, type_id: &RadIntId, owriter: &mut Cursor>) { - match type_id { - RadIntId::U8 => { - owriter - .write_all(&(v.len() as u8).to_le_bytes()) - .expect("coudn't write to output file"); - } - RadIntId::U16 => { - owriter - .write_all(&(v.len() as u16).to_le_bytes()) - .expect("coudn't write to output file"); + pub fn parse_tags_from_bytes_checked(&self, reader: &mut T) -> anyhow::Result { + // loop over all of the tag descriptions in this section, and parse a + // tag value for each. + //let mut tv = Vec::::new(); + let mut tm = TagMap::with_keyset(&self.tags); + for tag_desc in &self.tags { + if !tm.add_checked(tag_desc.value_from_bytes(reader)) { + panic!("Tried to read value for non-matching type"); + } } - RadIntId::U32 => { - owriter - .write_all(&(v.len() as u32).to_le_bytes()) - .expect("coudn't write to output file"); + Ok(tm) + } + + pub fn get_tag_type(&self, name: &str) -> Option { + for td in &self.tags { + if name == td.name { + return Some(td.typeid); + } } - RadIntId::U64 => { - owriter - .write_all(&(v.len() as u64).to_le_bytes()) - .expect("coudn't write to output file"); + None + } + + /// return `true` if this [TagSection] has a [TagDesc] with a + /// name matching `name`, and `false` otherwise. 
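As a brief sketch of how these accessors fit together (using the TagSection and RadType types defined in this module), one can check that a tag is present before asking for its type:

fn barcode_tag_type(read_tags: &TagSection) -> anyhow::Result<RadType> {
    // refuse inputs that lack the read-level barcode tag
    anyhow::ensure!(read_tags.has_tag("b"), "expected a 'b' read-level tag");
    Ok(read_tags
        .get_tag_type("b")
        .expect("presence was checked just above"))
}

called, for example, as `barcode_tag_type(&prelude.read_tags)?` on a parsed RadPrelude.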
+ pub fn has_tag(&self, name: &str) -> bool { + for td in &self.tags { + if name == td.name { + return true; + } } + false + } +} + +#[cfg(test)] +mod tests { + use crate::rad_types::RadType; + use crate::rad_types::{RadAtomicId, RadIntId, TagSection, TagSectionLabel, TagValue}; + use std::io::Write; + + use super::TagDesc; + #[test] + fn can_parse_simple_tag_desc() { + let mut buf = Vec::::new(); + let tag_name = b"mytag"; + let _ = buf.write_all(&5_u16.to_ne_bytes()); + let _ = buf.write_all(tag_name); + let tag_type = 4_u8; + let _ = buf.write_all(&tag_type.to_ne_bytes()); + + let desc = TagDesc::from_bytes(&mut buf.as_slice()).unwrap(); + assert_eq!(desc.name, "mytag"); + assert_eq!(desc.typeid, RadType::Int(RadIntId::U64)); + } + + #[test] + fn can_parse_array_tag_desc() { + let mut buf = Vec::::new(); + let tag_name = b"mytag"; + let _ = buf.write_all(&5_u16.to_ne_bytes()); + let _ = buf.write_all(tag_name); + let tag_type = 7_u8; + let _ = buf.write_all(&tag_type.to_ne_bytes()); + // length type + let _ = buf.write_all(&1_u8.to_ne_bytes()); + // element type + let _ = buf.write_all(&2_u8.to_ne_bytes()); + + let desc = TagDesc::from_bytes(&mut buf.as_slice()).unwrap(); + assert_eq!(desc.name, "mytag"); + assert_eq!( + desc.typeid, + RadType::Array(RadIntId::U8, RadAtomicId::Int(RadIntId::U16)) + ); + } + + #[test] + fn can_parse_tag_values_from_section() { + let mut buf = Vec::::new(); + let tag_name = b"mytag"; + let _ = buf.write_all(&5_u16.to_ne_bytes()); + let _ = buf.write_all(tag_name); + let tag_type = 7_u8; + let _ = buf.write_all(&tag_type.to_ne_bytes()); + // length type + let _ = buf.write_all(&1_u8.to_ne_bytes()); + // element type + let _ = buf.write_all(&2_u8.to_ne_bytes()); + + let desc = TagDesc::from_bytes(&mut buf.as_slice()).unwrap(); + assert_eq!(desc.name, "mytag"); + assert_eq!( + desc.typeid, + RadType::Array(RadIntId::U8, RadAtomicId::Int(RadIntId::U16)) + ); + + buf.clear(); + let tag_name = b"stringtag"; + let _ = buf.write_all(&9_u16.to_ne_bytes()); + let _ = buf.write_all(tag_name); + // type id + let _ = buf.write_all(&8_u8.to_ne_bytes()); + let desc_str = TagDesc::from_bytes(&mut buf.as_slice()).unwrap(); + + let tag_sec = TagSection { + label: TagSectionLabel::FileTags, + tags: vec![desc, desc_str], + }; + + buf.clear(); + let _ = buf.write_all(&3_u8.to_ne_bytes()); + let _ = buf.write_all(&1_u16.to_ne_bytes()); + let _ = buf.write_all(&2_u16.to_ne_bytes()); + let _ = buf.write_all(&3_u16.to_ne_bytes()); + + let _ = buf.write_all(&6_u16.to_ne_bytes()); + let _ = buf.write_all(b"hi_rad"); + + let map = tag_sec.parse_tags_from_bytes(&mut buf.as_slice()).unwrap(); + assert_eq!( + map.get("mytag").unwrap(), + &TagValue::ArrayU16(vec![1, 2, 3]) + ); + assert_eq!( + map.get("stringtag").unwrap(), + &TagValue::String(String::from("hi_rad")) + ); + + assert_eq!( + map.get_at_index(0).unwrap(), + &TagValue::ArrayU16(vec![1, 2, 3]) + ); + assert_eq!( + map.get_at_index(1).unwrap(), + &TagValue::String(String::from("hi_rad")) + ); + + let map = tag_sec + .parse_tags_from_bytes_checked(&mut buf.as_slice()) + .unwrap(); + assert_eq!( + map.get("mytag").unwrap(), + &TagValue::ArrayU16(vec![1, 2, 3]) + ); + assert_eq!( + map.get("stringtag").unwrap(), + &TagValue::String(String::from("hi_rad")) + ); + + assert_eq!( + map.get_at_index(0).unwrap(), + &TagValue::ArrayU16(vec![1, 2, 3]) + ); + assert_eq!( + map.get_at_index(1).unwrap(), + &TagValue::String(String::from("hi_rad")) + ); + + assert_eq!(map[0], TagValue::ArrayU16(vec![1, 2, 3])); + assert_eq!(map[1], 
TagValue::String(String::from("hi_rad"))); } - owriter - .write_all(v.as_bytes()) - .expect("coudn't write to output file"); } diff --git a/src/record.rs b/src/record.rs new file mode 100644 index 0000000..1df3c65 --- /dev/null +++ b/src/record.rs @@ -0,0 +1,332 @@ +use crate::{ + io as rad_io, + rad_types::{MappedFragmentOrientation, MappingType, RadIntId, RadType, TagSection}, + utils, +}; +use anyhow::{self, bail}; +use bio_types::strand::*; +use scroll::Pread; +use std::io::Read; +use std::mem; + +/// A concrete struct representing a [MappedRecord] +/// for reads processed upstream with `piscem` (or `salmon alevin`). +/// This represents the set of alignments and relevant information +/// for a basic alevin-fry record. +#[derive(Debug)] +pub struct AlevinFryReadRecord { + pub bc: u64, + pub umi: u64, + pub dirs: Vec, + pub refs: Vec, +} + +#[derive(Debug)] +pub struct PiscemBulkReadRecord { + pub frag_type: u8, + pub dirs: Vec, + pub refs: Vec, + pub positions: Vec, + pub frag_lengths: Vec, +} + +/// This trait represents a mapped read record that should be stored +/// in the [crate::chunk::Chunk] of a RAD file. The [crate::chunk::Chunk] type is parameterized on +/// some concrete struct that must implement this [MappedRecord] trait. +/// This trat defines the necessary functions required to be able to parse +/// the read record from the underlying reader, as well as the associated +/// types that are necessary to provide the context to perform this parsing. +pub trait MappedRecord { + /// the information necessary to be able to correctly + /// parse a concrete instance of a struct implementing + /// [MappedRecord] from an input stream. This should + /// encapsulate any context necessary to perform such + /// parsing. + type ParsingContext; + /// The information that should be returned if one wishes + /// to peek at the next record in the input stream. + type PeekResult; + + fn peek_record(buf: &[u8], ctx: &Self::ParsingContext) -> Self::PeekResult; + fn from_bytes_with_context(reader: &mut T, ctx: &Self::ParsingContext) -> Self; +} + +/// This trait allows obtaining and passing along necessary information that +/// may be required for a [MappedRecord] to be properly parsed from a file. +/// Typically, this information will be relevant information about the tags +/// that are used for parsing these records. It gets information about the +/// file, read, and alignment-level [TagSection]s from the [crate::header::RadPrelude] and +/// can then copy any information that may be later necessary for parsing. +pub trait RecordContext { + fn get_context_from_tag_section( + ft: &TagSection, + rt: &TagSection, + at: &TagSection, + ) -> anyhow::Result + where + Self: Sized; +} + +/// context needed to read an alevin-fry record +/// (the types of the barcode and umi) +#[derive(Debug)] +pub struct AlevinFryRecordContext { + pub bct: RadIntId, + pub umit: RadIntId, +} + +impl RecordContext for AlevinFryRecordContext { + /// Currently, the [AlevinFryRecordContext] only cares about and provides the read tags that + /// correspond to the length of the barcode and the UMI. Here, these are parsed from the + /// corresponding [TagSection]. 
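To make the two traits above concrete, here is a minimal, hypothetical sketch of a custom record type; the `ToyReadRecord`/`ToyRecordContext` names are invented for illustration, and the `<T: Read>` bound on `from_bytes_with_context` is assumed to match the existing implementations:

use anyhow::bail;
use libradicl::io::read_into_u64;
use libradicl::rad_types::{RadIntId, RadType, TagSection};
use libradicl::record::{MappedRecord, RecordContext};
use std::io::Read;

/// A toy record holding only a barcode and its alignment count (hypothetical).
#[derive(Debug)]
pub struct ToyReadRecord {
    pub bc: u64,
    pub num_aln: u32,
}

pub struct ToyRecordContext {
    pub bct: RadIntId,
}

impl RecordContext for ToyRecordContext {
    fn get_context_from_tag_section(
        _ft: &TagSection,
        rt: &TagSection,
        _at: &TagSection,
    ) -> anyhow::Result<Self> {
        // the only context needed here is the barcode width, taken from the read-level tags
        match rt.get_tag_type("b") {
            Some(RadType::Int(bct)) => Ok(Self { bct }),
            _ => bail!("toy record context requires an integer 'b' read-level tag"),
        }
    }
}

impl MappedRecord for ToyReadRecord {
    type ParsingContext = ToyRecordContext;
    type PeekResult = u64;

    fn peek_record(buf: &[u8], ctx: &Self::ParsingContext) -> Self::PeekResult {
        // the barcode sits just after the 4-byte alignment count
        ctx.bct.read_into_usize(&buf[std::mem::size_of::<u32>()..]) as u64
    }

    fn from_bytes_with_context<T: Read>(reader: &mut T, ctx: &Self::ParsingContext) -> Self {
        let mut nbuf = [0u8; 4];
        reader.read_exact(&mut nbuf).unwrap();
        let num_aln = u32::from_le_bytes(nbuf);
        let bc = read_into_u64(reader, &ctx.bct);
        // a real record type would also consume its per-alignment fields here
        Self { bc, num_aln }
    }
}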
+
+/// Context needed to read an alevin-fry record
+/// (the types of the barcode and the UMI).
+#[derive(Debug)]
+pub struct AlevinFryRecordContext {
+    pub bct: RadIntId,
+    pub umit: RadIntId,
+}
+
+impl RecordContext for AlevinFryRecordContext {
+    /// Currently, the [AlevinFryRecordContext] only cares about and provides the read tags that
+    /// correspond to the length of the barcode and the UMI. Here, these are parsed from the
+    /// corresponding [TagSection].
+    fn get_context_from_tag_section(
+        _ft: &TagSection,
+        rt: &TagSection,
+        _at: &TagSection,
+    ) -> anyhow::Result<Self> {
+        let bct = rt
+            .get_tag_type("b")
+            .expect("alevin-fry record context requires a \'b\' read-level tag");
+        let umit = rt
+            .get_tag_type("u")
+            .expect("alevin-fry record context requires a \'u\' read-level tag");
+        if let (RadType::Int(x), RadType::Int(y)) = (bct, umit) {
+            Ok(Self { bct: x, umit: y })
+        } else {
+            bail!("alevin-fry record context requires that b and u tags are of type RadType::Int");
+        }
+    }
+}
+
+impl AlevinFryRecordContext {
+    pub fn from_bct_umit(bct: RadIntId, umit: RadIntId) -> Self {
+        Self { bct, umit }
+    }
+}
+
+#[derive(Debug)]
+pub struct PiscemBulkRecordContext {
+    pub frag_map_t: RadIntId,
+}
+
+impl RecordContext for PiscemBulkRecordContext {
+    fn get_context_from_tag_section(
+        _ft: &TagSection,
+        rt: &TagSection,
+        _at: &TagSection,
+    ) -> anyhow::Result<Self> {
+        let frag_map_t = rt
+            .get_tag_type("frag_map_type")
+            .expect("piscem bulk record context requires a \"frag_map_type\" read-level tag");
+        if let RadType::Int(x) = frag_map_t {
+            Ok(Self { frag_map_t: x })
+        } else {
+            bail!("piscem bulk record context requires that the \"frag_map_type\" tag is of type RadType::Int");
+        }
+    }
+}
+
+impl MappedRecord for PiscemBulkReadRecord {
+    type ParsingContext = PiscemBulkRecordContext;
+    type PeekResult = Option;
+
+    #[inline]
+    fn from_bytes_with_context<T: Read>(reader: &mut T, ctx: &Self::ParsingContext) -> Self {
+        // Each alignment word packs a 2-bit orientation in its upper bits and a
+        // 30-bit reference id in its lower bits.
+        const MASK_UPPER_2_BITS: u32 = 0xC0000000;
+        const MASK_LOWER_30_BITS: u32 = 0x3FFFFFFF;
+        let mut rbuf = [0u8; 255];
+
+        reader.read_exact(&mut rbuf[0..4]).unwrap();
+        let na = rbuf.pread::<u32>(0).unwrap();
+        let fmt = rad_io::read_into_u64(reader, &ctx.frag_map_t);
+        let f = MappingType::from_u8(fmt as u8);
+
+        let mut rec = Self {
+            frag_type: fmt as u8,
+            dirs: Vec::with_capacity(na as usize),
+            refs: Vec::with_capacity(na as usize),
+            positions: Vec::with_capacity(na as usize),
+            frag_lengths: Vec::with_capacity(na as usize),
+        };
+
+        //println!("number of records : {:?}",na);
+
+        for _ in 0..(na as usize) {
+            reader.read_exact(&mut rbuf[0..4]).unwrap();
+            let v = rbuf.pread::<u32>(0).unwrap();
+
+            let dir_int = (v & MASK_UPPER_2_BITS) >> 30;
+            let dir = MappedFragmentOrientation::from_u32_paired_status(dir_int, f);
+            rec.dirs.push(dir);
+            rec.refs.push(v & MASK_LOWER_30_BITS);
+            // position
+            reader.read_exact(&mut rbuf[0..4]).unwrap();
+            let pos = rbuf.pread::<u32>(0).unwrap();
+            rec.positions.push(pos);
+            // length
+            reader.read_exact(&mut rbuf[0..2]).unwrap();
+            let flen = rbuf.pread::<u16>(0).unwrap();
+            rec.frag_lengths.push(flen);
+        }
+
+        rec
+    }
+
+    #[inline]
+    fn peek_record(_buf: &[u8], _ctx: &Self::ParsingContext) -> Self::PeekResult {
+        unimplemented!(
+            "Currently there is no implementation for peek_record for PiscemBulkReadRecord. This should not be needed"
+        );
+    }
+}
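To make the per-alignment layout parsed above concrete: each alignment is stored as a packed `u32` word whose upper 2 bits encode the orientation code and whose lower 30 bits encode the reference id, followed by a `u32` position and a `u16` fragment length. A tiny self-contained sketch, not part of the patch, of packing and unpacking one such word with the same masks:

fn main() {
    // Pack an entry by hand: orientation code 2 in the upper 2 bits,
    // reference id 7 in the lower 30 bits.
    let v: u32 = (2u32 << 30) | 7;

    // Unpack it with the masks used by the parser above.
    let ori_code = (v & 0xC000_0000) >> 30; // upper 2 bits: orientation code
    let ref_id = v & 0x3FFF_FFFF; // lower 30 bits: reference id
    assert_eq!((ori_code, ref_id), (2, 7));
}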
+
+impl MappedRecord for AlevinFryReadRecord {
+    type ParsingContext = AlevinFryRecordContext;
+    type PeekResult = (u64, u64);
+
+    #[inline]
+    fn peek_record(buf: &[u8], ctx: &Self::ParsingContext) -> Self::PeekResult {
+        let na_size = mem::size_of::<u32>();
+        let bc_size = ctx.bct.bytes_for_type();
+
+        let _na = buf.pread::<u32>(0).unwrap();
+
+        let bc = match ctx.bct {
+            RadIntId::U8 => buf.pread::<u8>(na_size).unwrap() as u64,
+            RadIntId::U16 => buf.pread::<u16>(na_size).unwrap() as u64,
+            RadIntId::U32 => buf.pread::<u32>(na_size).unwrap() as u64,
+            RadIntId::U64 => buf.pread::<u64>(na_size).unwrap(),
+        };
+        let umi = match ctx.umit {
+            RadIntId::U8 => buf.pread::<u8>(na_size + bc_size).unwrap() as u64,
+            RadIntId::U16 => buf.pread::<u16>(na_size + bc_size).unwrap() as u64,
+            RadIntId::U32 => buf.pread::<u32>(na_size + bc_size).unwrap() as u64,
+            RadIntId::U64 => buf.pread::<u64>(na_size + bc_size).unwrap(),
+        };
+        (bc, umi)
+    }
+
+    #[inline]
+    fn from_bytes_with_context<T: Read>(reader: &mut T, ctx: &Self::ParsingContext) -> Self {
+        let mut rbuf = [0u8; 255];
+
+        let (bc, umi, na) = Self::from_bytes_record_header(reader, &ctx.bct, &ctx.umit);
+
+        let mut rec = Self {
+            bc,
+            umi,
+            dirs: Vec::with_capacity(na as usize),
+            refs: Vec::with_capacity(na as usize),
+        };
+
+        for _ in 0..(na as usize) {
+            reader.read_exact(&mut rbuf[0..4]).unwrap();
+            let v = rbuf.pread::<u32>(0).unwrap();
+            let dir = (v & utils::MASK_LOWER_31_U32) != 0;
+            rec.dirs.push(dir);
+            rec.refs.push(v & utils::MASK_TOP_BIT_U32);
+        }
+        rec
+    }
+}
+
+impl AlevinFryReadRecord {
+    /// Returns `true` if this [AlevinFryReadRecord] contains no references and
+    /// `false` otherwise.
+    pub fn is_empty(&self) -> bool {
+        self.refs.is_empty()
+    }
+
+    /// Obtains the next [AlevinFryReadRecord] in the stream from the reader `reader`.
+    /// The barcode should be encoded with the [RadIntId] type `bct` and
+    /// the UMI should be encoded with the [RadIntId] type `umit`.
+    pub fn from_bytes<T: Read>(reader: &mut T, bct: &RadIntId, umit: &RadIntId) -> Self {
+        let mut rbuf = [0u8; 255];
+
+        let (bc, umi, na) = Self::from_bytes_record_header(reader, bct, umit);
+
+        let mut rec = Self {
+            bc,
+            umi,
+            dirs: Vec::with_capacity(na as usize),
+            refs: Vec::with_capacity(na as usize),
+        };
+
+        for _ in 0..(na as usize) {
+            reader.read_exact(&mut rbuf[0..4]).unwrap();
+            let v = rbuf.pread::<u32>(0).unwrap();
+            let dir = (v & utils::MASK_LOWER_31_U32) != 0;
+            rec.dirs.push(dir);
+            rec.refs.push(v & utils::MASK_TOP_BIT_U32);
+        }
+        rec
+    }
+
+    #[inline]
+    pub fn from_bytes_record_header<T: Read>(
+        reader: &mut T,
+        bct: &RadIntId,
+        umit: &RadIntId,
+    ) -> (u64, u64, u32) {
+        let mut rbuf = [0u8; 4];
+        reader.read_exact(&mut rbuf).unwrap();
+        let na = u32::from_le_bytes(rbuf); //.pread::<u32>(0).unwrap();
+        let bc = rad_io::read_into_u64(reader, bct);
+        let umi = rad_io::read_into_u64(reader, umit);
+        (bc, umi, na)
+    }
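The record head that `from_bytes_record_header` and `peek_record` both assume is a `u32` alignment count followed by the encoded barcode and then the encoded UMI. A small sketch, not part of the patch, that builds such a head by hand and peeks it; the choice of `RadIntId::U32` for both fields and the little-endian byte order are just for illustration.

use libradicl::rad_types::RadIntId;
use libradicl::record::{AlevinFryReadRecord, AlevinFryRecordContext, MappedRecord};

fn main() {
    // Hand-built record head: 2 alignments, barcode 0xAB, UMI 0xCD, both stored as u32.
    let mut buf = Vec::new();
    buf.extend_from_slice(&2u32.to_le_bytes());
    buf.extend_from_slice(&0xABu32.to_le_bytes());
    buf.extend_from_slice(&0xCDu32.to_le_bytes());

    let ctx = AlevinFryRecordContext::from_bct_umit(RadIntId::U32, RadIntId::U32);
    let (bc, umi) = AlevinFryReadRecord::peek_record(&buf, &ctx);
    assert_eq!((bc, umi), (0xAB, 0xCD));
}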
+
+    /// Read the next [AlevinFryReadRecord] from `reader`, but retain only those
+    /// alignment records that match the prescribed orientation provided in
+    /// `expected_ori` (which is a [Strand]). This function assumes the
+    /// read header has already been parsed, and just reads the raw
+    /// record contents consisting of the references and directions.
+    #[inline]
+    pub fn from_bytes_with_header_keep_ori<T: Read>(
+        reader: &mut T,
+        bc: u64,
+        umi: u64,
+        na: u32,
+        expected_ori: &Strand,
+    ) -> Self {
+        let mut rbuf = [0u8; 255];
+        let mut rec = Self {
+            bc,
+            umi,
+            dirs: Vec::with_capacity(na as usize),
+            refs: Vec::with_capacity(na as usize),
+        };
+
+        for _ in 0..(na as usize) {
+            reader.read_exact(&mut rbuf[0..4]).unwrap();
+            let v = rbuf.pread::<u32>(0).unwrap();
+
+            // fw if the leftmost bit is 1, otherwise rc
+            let strand = if (v & utils::MASK_LOWER_31_U32) > 0 {
+                Strand::Forward
+            } else {
+                Strand::Reverse
+            };
+
+            if expected_ori.same(&strand) || expected_ori.is_unknown() {
+                rec.refs.push(v & utils::MASK_TOP_BIT_U32);
+            }
+        }
+
+        // make sure these are sorted in this step.
+        rec.refs.sort_unstable();
+        rec
+    }
+
+    /// Read the next [AlevinFryReadRecord], including the header, from `reader`, but
+    /// retain only those alignment records that match the prescribed
+    /// orientation provided in `expected_ori` (which is a [Strand]).
+    #[inline]
+    pub fn from_bytes_keep_ori<T: Read>(
+        reader: &mut T,
+        bct: &RadIntId,
+        umit: &RadIntId,
+        expected_ori: &Strand,
+    ) -> Self {
+        let (bc, umi, na) = Self::from_bytes_record_header(reader, bct, umit);
+        Self::from_bytes_with_header_keep_ori(reader, bc, umi, na, expected_ori)
+    }
+}
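Putting the orientation-filtered readers to use: a minimal sketch, not part of the patch, that reads one chunk's records while keeping only alignments to the forward strand. It assumes `reader` is positioned at the start of a chunk (as it would be right after the file-level tag values), and that `bct` and `umit` describe the barcode and UMI encodings, for example as obtained from an `AlevinFryRecordContext`.

use bio_types::strand::Strand;
use libradicl::chunk::Chunk;
use libradicl::rad_types::RadIntId;
use libradicl::record::AlevinFryReadRecord;
use std::io::Read;

fn read_chunk_forward_only<T: Read>(
    reader: &mut T,
    bct: &RadIntId,
    umit: &RadIntId,
) -> Vec<AlevinFryReadRecord> {
    // Consume the chunk header (number of bytes and number of records) ...
    let (_nbytes, nrec) = Chunk::<AlevinFryReadRecord>::read_header(reader);
    // ... then parse each record, dropping alignments that are not forward-stranded.
    (0..nrec)
        .map(|_| AlevinFryReadRecord::from_bytes_keep_ori(reader, bct, umit, &Strand::Forward))
        .collect()
}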