Merge pull request #21 from COMBINE-lab/develop

feat: major rework of libradicl
COMBINE-lab · Feb 26, 2024 · 30db3a6 · 30db3a6
2 parents 321d1c0 + a163b99
commit 30db3a6
Show file tree

Hide file tree

Showing 12 changed files with 1,474 additions and 387 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "libradicl"
-version = "0.7.0"
+version = "0.8.0"
 authors = [
   "Avi Srivastava <[email protected]>",
   "Hirak Sarkar <[email protected]>",
@@ -15,7 +15,7 @@ readme = "README.md"
 repository = "https://github.com/COMBINE-lab/alevin-fry"
 homepage = "https://github.com/COMBINE-lab/alevin-fry"
 documentation = "https://alevin-fry.readthedocs.io/en/latest/"
-include = ["src/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"]
+include = ["src/*.rs", "examples/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"]
 keywords = [
   "single-cell",
   "preprocessing",
@@ -29,10 +29,14 @@ categories = ["command-line-utilities", "science"]
 snap = "1"
 scroll = "0.12.0"
 num = "0.4.1"
-ahash = "0.8.8"
-serde = { version = "1.0.196", features = ["derive"] }
+ahash = "0.8.9"
+serde = { version = "1.0.197", features = ["derive"] }
 dashmap = "^5.5.3"
 bio-types = "1.0.1"
 smallvec = "1.13.1"
 noodles-bam = "0.56.0"
 noodles-sam = "0.53.0"
+anyhow = "1.0.80"
+
+[dev-dependencies]
+needletail="0.5.1"
diff --git a/examples/read_chunk_bulk.rs b/examples/read_chunk_bulk.rs
@@ -0,0 +1,32 @@
+use anyhow;
+use libradicl;
+use libradicl::chunk::Chunk;
+use libradicl::record::{PiscemBulkReadRecord, PiscemBulkRecordContext};
+use std::io::BufReader;
+
+fn main() -> anyhow::Result<()> {
+    let fname = std::env::args().nth(1).expect("input filename");
+    let f = std::fs::File::open(&fname)?;
+    let mut ifile = BufReader::new(f);
+    let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?;
+    if let Ok(summary) = p.summary(None) {
+        println!("{}", summary);
+    }
+
+    let _tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?;
+
+    // Any extra context we may need to parse the records. In this case, it's the
+    // size of the barcode and the umi.
+    let tag_context = p.get_record_context::<PiscemBulkRecordContext>()?;
+    let first_chunk = Chunk::<PiscemBulkReadRecord>::from_bytes(&mut ifile, &tag_context);
+    println!(
+        "Chunk :: nbytes: {}, nrecs: {}",
+        first_chunk.nbytes, first_chunk.nrec
+    );
+    assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len());
+    for (i, r) in first_chunk.reads.iter().take(10).enumerate() {
+        println!("record {i}: {:?}", r);
+    }
+
+    Ok(())
+}
diff --git a/examples/read_chunk_single_cell.rs b/examples/read_chunk_single_cell.rs
@@ -0,0 +1,32 @@
+use anyhow;
+use libradicl::chunk::Chunk;
+use libradicl::header;
+use libradicl::record::{AlevinFryReadRecord, AlevinFryRecordContext};
+use std::io::BufReader;
+
+fn main() -> anyhow::Result<()> {
+    let fname = std::env::args().nth(1).expect("input filename");
+    let f = std::fs::File::open(&fname)?;
+    let mut ifile = BufReader::new(f);
+    let p = header::RadPrelude::from_bytes(&mut ifile)?;
+    if let Ok(summary) = p.summary(None) {
+        println!("{}", summary);
+    }
+
+    let tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?;
+    println!("tag map {:?}\n", tag_map);
+    // Any extra context we may need to parse the records. In this case, it's the
+    // size of the barcode and the umi.
+    let tag_context = p.get_record_context::<AlevinFryRecordContext>()?;
+    let first_chunk = Chunk::<AlevinFryReadRecord>::from_bytes(&mut ifile, &tag_context);
+    println!(
+        "Chunk :: nbytes: {}, nrecs: {}",
+        first_chunk.nbytes, first_chunk.nrec
+    );
+    assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len());
+    for (i, r) in first_chunk.reads.iter().take(10).enumerate() {
+        println!("record {i}: {:?}", r);
+    }
+
+    Ok(())
+}
diff --git a/examples/read_header.rs b/examples/read_header.rs
@@ -0,0 +1,48 @@
+use anyhow;
+use libradicl::{self, rad_types};
+use std::io::BufReader;
+
+fn main() -> anyhow::Result<()> {
+    let fname = std::env::args().nth(1).expect("input filename");
+    /*
+    {
+        let f = std::fs::File::open(&fname)?;
+        let mut ifile = BufReader::new(f);
+        let h = rad_types::RadHeader::from_bytes(&mut ifile)?;
+
+        if let Ok(summary) = h.summary(None) {
+            println!("{}", summary);
+        }
+
+        let ts = rad_types::TagSection::from_bytes(&mut ifile)?;
+        println!("File-level tags: {ts:?}");
+
+        let ts = rad_types::TagSection::from_bytes(&mut ifile)?;
+        println!("Read-level tags: {ts:?}");
+
+        let ts = rad_types::TagSection::from_bytes(&mut ifile)?;
+        println!("Alignment-level tags: {ts:?}");
+    }
+    */
+
+    println!("\n");
+
+    {
+        let f = std::fs::File::open(&fname)?;
+        let mut ifile = BufReader::new(f);
+        let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?;
+        if let Ok(summary) = p.summary(None) {
+            println!("{}", summary);
+        }
+        let ftmp = p.file_tags.parse_tags_from_bytes(&mut ifile)?;
+        if let Some(rad_types::TagValue::ArrayU32(v)) = ftmp.get("ref_lengths") {
+            println!(
+                "file tag values: {:?}",
+                v.iter().take(10).collect::<Vec<&u32>>()
+            );
+        } else {
+            println!("file-level tags: {:?}", &ftmp);
+        }
+    }
+    Ok(())
+}
diff --git a/src/chunk.rs b/src/chunk.rs
@@ -0,0 +1,69 @@
+use crate::{self as libradicl};
+use libradicl::record::MappedRecord;
+use scroll::Pread;
+use std::io::{Cursor, Read};
+
+#[derive(Debug)]
+pub struct Chunk<T: MappedRecord> {
+    pub nbytes: u32,
+    pub nrec: u32,
+    pub reads: Vec<T>,
+}
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct CorrectedCbChunk {
+    pub(crate) remaining_records: u32,
+    pub(crate) corrected_bc: u64,
+    pub(crate) nrec: u32,
+    pub(crate) data: Cursor<Vec<u8>>, /*,
+                                      umis: Vec<u64>,
+                                      ref_offsets: Vec<u32>,
+                                      ref_ids: Vec<u32>,
+                                      */
+}
+
+pub struct ChunkConfig {
+    pub num_chunks: u64,
+    pub bc_type: u8,
+    pub umi_type: u8,
+}
+
+impl<R: MappedRecord> Chunk<R> {
+    /// Read the header of the next [Chunk] from the provided `reader`. This
+    /// function returns a tuple representing the number of bytes and number of
+    /// records, respectively, in the chunk.
+    #[inline]
+    pub fn read_header<T: Read>(reader: &mut T) -> (u32, u32) {
+        let mut buf = [0u8; 8];
+        reader.read_exact(&mut buf).unwrap();
+        let nbytes = buf.pread::<u32>(0).unwrap();
+        let nrec = buf.pread::<u32>(4).unwrap();
+        (nbytes, nrec)
+    }
+
+    /// Read the next [Chunk] from the provided reader and return it.
+    #[inline]
+    pub fn from_bytes<T: Read>(reader: &mut T, ctx: &R::ParsingContext) -> Self {
+        let (nbytes, nrec) = Self::read_header(reader);
+        let mut c = Self {
+            nbytes,
+            nrec,
+            reads: Vec::<R>::with_capacity(nrec as usize),
+        };
+
+        for _ in 0..(nrec as usize) {
+            c.reads.push(R::from_bytes_with_context(reader, ctx));
+        }
+
+        c
+    }
+
+    /// Peeks to the first [libradicl::record::AlevinFryReadRecord] in the buffer `buf`, and returns
+    /// the barcode and umi associated with this record.  It is assumed
+    /// that there is at least one [libradicl::record::AlevinFryReadRecord] present in the buffer.
+    #[inline]
+    pub fn peek_record(buf: &[u8], ctx: &R::ParsingContext) -> R::PeekResult {
+        R::peek_record(buf, ctx)
+    }
+}
diff --git a/src/constants.rs b/src/constants.rs
@@ -0,0 +1 @@
+pub(crate) const MAX_REF_NAME_LEN: usize = 65536;
diff --git a/src/header.rs b/src/header.rs
@@ -0,0 +1,176 @@
+use crate::{self as libradicl, constants};
+use libradicl::rad_types::{TagSection, TagSectionLabel};
+use libradicl::record::RecordContext;
+use noodles_sam as sam;
+use scroll::Pread;
+use std::io::Read;
+
+/// The [RadPrelude] groups together the [RadHeader]
+/// as well as the relevant top-level [TagSection]s of the file.
+/// It constitutes everything in the initial file prior to the
+/// start of the first [libradicl::chunk::Chunk].
+pub struct RadPrelude {
+    pub hdr: RadHeader,
+    pub file_tags: TagSection,
+    pub read_tags: TagSection,
+    pub aln_tags: TagSection,
+}
+
+/// The [RadHeader] contains the relevant information about the
+/// references against which the reads in this file were mapped and
+/// information about the way in which mapping was performed.
+pub struct RadHeader {
+    pub is_paired: u8,
+    pub ref_count: u64,
+    pub ref_names: Vec<String>,
+    pub num_chunks: u64,
+}
+
+impl Default for RadHeader {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RadHeader {
+    /// Create a new empty [RadHeader]
+    pub fn new() -> Self {
+        Self {
+            is_paired: 0,
+            ref_count: 0,
+            ref_names: vec![],
+            num_chunks: 0,
+        }
+    }
+
+    /// Create and return a new [RadHeader] by reading the contents of the
+    /// `reader`. If the reader is positioned such that a valid [RadHeader] comes
+    /// next, then this function returns [Ok(RadHeader)], otherwise, it returns
+    /// an [anyhow::Error] explaining the failure to parse the [RadHeader].
+    pub fn from_bytes<T: Read>(reader: &mut T) -> anyhow::Result<RadHeader> {
+        let mut rh = RadHeader::new();
+
+        // size of the longest allowable string.
+        let mut buf = [0u8; constants::MAX_REF_NAME_LEN];
+        reader.read_exact(&mut buf[0..9])?;
+        rh.is_paired = buf.pread(0)?;
+        rh.ref_count = buf.pread::<u64>(1)?;
+
+        // we know how many names we will read in.
+        rh.ref_names.reserve_exact(rh.ref_count as usize);
+
+        let mut num_read = 0u64;
+        while num_read < rh.ref_count {
+            // the length of the string
+            reader.read_exact(&mut buf[0..2])?;
+            let l: usize = buf.pread::<u16>(0)? as usize;
+            // the string itself
+            reader.read_exact(&mut buf[0..l])?;
+            rh.ref_names
+                .push(std::str::from_utf8(&buf[0..l])?.to_string());
+            num_read += 1;
+        }
+
+        reader.read_exact(&mut buf[0..8])?;
+        rh.num_chunks = buf.pread::<u64>(0)?;
+        Ok(rh)
+    }
+
+    /// Create and return a [RadHeader] from the provided BAM/SAM header
+    /// (represented by the noodles [sam::Header] `header`).  
+    /// **Note**: The returned [RadHeader] will *not* have a value for the `num_chunks`
+    /// field, which will remain set at 0, nor will it set a meaningful value for the
+    /// `is_paried` flag, since the SAM/BAM header itself doesn't encode this information.
+    pub fn from_bam_header(header: &sam::Header) -> RadHeader {
+        let mut rh = RadHeader {
+            is_paired: 0,
+            ref_count: 0,
+            ref_names: vec![],
+            num_chunks: 0,
+        };
+
+        let ref_seqs = header.reference_sequences();
+        rh.ref_count = ref_seqs.len() as u64;
+        // we know how many names we will read in.
+        rh.ref_names.reserve_exact(rh.ref_count as usize);
+        for (k, _v) in ref_seqs.iter() {
+            rh.ref_names.push(k.to_string());
+        }
+        rh
+    }
+
+    /// Returns the size, in bytes, that this [RadHeader] will take
+    /// if written to an output stream.
+    pub fn get_size(&self) -> usize {
+        let mut tot_size = 0usize;
+        tot_size += std::mem::size_of_val(&self.is_paired) + std::mem::size_of_val(&self.ref_count);
+        // each name takes 2 bytes for the length, plus the actual
+        // number of bytes required by the string itself.
+        for t in self.ref_names.iter().map(|a| a.len()) {
+            tot_size += std::mem::size_of::<u16>() + t;
+        }
+        tot_size += std::mem::size_of_val(&self.num_chunks);
+        tot_size
+    }
+
+    pub fn summary(&self, num_refs: Option<usize>) -> anyhow::Result<String> {
+        use std::fmt::Write as _;
+        let mut s = String::new();
+        writeln!(&mut s, "RadHeader {{")?;
+        writeln!(&mut s, "is_paired: {}", self.is_paired)?;
+        writeln!(&mut s, "ref_count: {}", self.ref_count)?;
+
+        let refs_to_print = match num_refs {
+            Some(rcount) => rcount.min(self.ref_count as usize),
+            None => (self.ref_count as usize).min(10_usize),
+        };
+
+        for rn in self.ref_names.iter().take(refs_to_print) {
+            writeln!(&mut s, "  ref: {}", rn)?;
+        }
+        writeln!(&mut s, "  ...")?;
+
+        writeln!(&mut s, "num_chunks: {}", self.num_chunks)?;
+        writeln!(&mut s, "}}")?;
+        Ok(s)
+    }
+}
+
+impl RadPrelude {
+    pub fn from_bytes<T: Read>(reader: &mut T) -> anyhow::Result<Self> {
+        let hdr = RadHeader::from_bytes(reader)?;
+        let file_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::FileTags)?;
+        let read_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::ReadTags)?;
+        let aln_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::AlignmentTags)?;
+
+        //let file_tag_vals = file_tags.parse_tags_from_bytes(reader)?;
+        //println!("file-level tag values: {:?}", file_tag_vals);
+
+        Ok(Self {
+            hdr,
+            file_tags,
+            read_tags,
+            aln_tags,
+        })
+    }
+
+    pub fn summary(&self, num_refs: Option<usize>) -> anyhow::Result<String> {
+        use std::fmt::Write as _;
+        let mut s = self.hdr.summary(num_refs)?;
+        writeln!(&mut s, "[[{:?}]]", self.file_tags)?;
+        writeln!(&mut s, "[[{:?}]]", self.read_tags)?;
+        writeln!(&mut s, "[[{:?}]]", self.aln_tags)?;
+        //writeln!(&mut s, "file-level tag values [{:?}]", self.file_tag_vals)?;
+        Ok(s)
+    }
+
+    /// Obtain a [RecordContext] for a record of type `R` from this prelude, by
+    /// using the associated [TagSection]s.  **Note**: Since this function
+    /// constructs the resulting `R` itself, and doesn't take any `R` parameter,
+    /// then it must always be invoked with the proper
+    /// [turbofish](https://doc.rust-lang.org/1.30.0/book/2018-edition/appendix-02-operators.html?highlight=turbofish#non-operator-symbols)
+    /// notation.
+    pub fn get_record_context<R: RecordContext>(&self) -> anyhow::Result<R> {
+        R::get_context_from_tag_section(&self.file_tags, &self.read_tags, &self.aln_tags)
+    }
+}