Skip to content

Commit

Permalink
Merge pull request #21 from COMBINE-lab/develop
Browse files Browse the repository at this point in the history
feat: major rework of libradicl
  • Loading branch information
rob-p committed Feb 26, 2024
2 parents 321d1c0 + a163b99 commit 30db3a6
Show file tree
Hide file tree
Showing 12 changed files with 1,474 additions and 387 deletions.
12 changes: 8 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "libradicl"
version = "0.7.0"
version = "0.8.0"
authors = [
"Avi Srivastava <[email protected]>",
"Hirak Sarkar <[email protected]>",
Expand All @@ -15,7 +15,7 @@ readme = "README.md"
repository = "https://github.com/COMBINE-lab/alevin-fry"
homepage = "https://github.com/COMBINE-lab/alevin-fry"
documentation = "https://alevin-fry.readthedocs.io/en/latest/"
include = ["src/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"]
include = ["src/*.rs", "examples/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"]
keywords = [
"single-cell",
"preprocessing",
Expand All @@ -29,10 +29,14 @@ categories = ["command-line-utilities", "science"]
snap = "1"
scroll = "0.12.0"
num = "0.4.1"
ahash = "0.8.8"
serde = { version = "1.0.196", features = ["derive"] }
ahash = "0.8.9"
serde = { version = "1.0.197", features = ["derive"] }
dashmap = "^5.5.3"
bio-types = "1.0.1"
smallvec = "1.13.1"
noodles-bam = "0.56.0"
noodles-sam = "0.53.0"
anyhow = "1.0.80"

[dev-dependencies]
needletail="0.5.1"
32 changes: 32 additions & 0 deletions examples/read_chunk_bulk.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use anyhow;
use libradicl;
use libradicl::chunk::Chunk;
use libradicl::record::{PiscemBulkReadRecord, PiscemBulkRecordContext};
use std::io::BufReader;

fn main() -> anyhow::Result<()> {
let fname = std::env::args().nth(1).expect("input filename");
let f = std::fs::File::open(&fname)?;
let mut ifile = BufReader::new(f);
let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?;
if let Ok(summary) = p.summary(None) {
println!("{}", summary);
}

let _tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?;

// Any extra context we may need to parse the records. In this case, it's the
// size of the barcode and the umi.
let tag_context = p.get_record_context::<PiscemBulkRecordContext>()?;
let first_chunk = Chunk::<PiscemBulkReadRecord>::from_bytes(&mut ifile, &tag_context);
println!(
"Chunk :: nbytes: {}, nrecs: {}",
first_chunk.nbytes, first_chunk.nrec
);
assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len());
for (i, r) in first_chunk.reads.iter().take(10).enumerate() {
println!("record {i}: {:?}", r);
}

Ok(())
}
32 changes: 32 additions & 0 deletions examples/read_chunk_single_cell.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use anyhow;
use libradicl::chunk::Chunk;
use libradicl::header;
use libradicl::record::{AlevinFryReadRecord, AlevinFryRecordContext};
use std::io::BufReader;

fn main() -> anyhow::Result<()> {
let fname = std::env::args().nth(1).expect("input filename");
let f = std::fs::File::open(&fname)?;
let mut ifile = BufReader::new(f);
let p = header::RadPrelude::from_bytes(&mut ifile)?;
if let Ok(summary) = p.summary(None) {
println!("{}", summary);
}

let tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?;
println!("tag map {:?}\n", tag_map);
// Any extra context we may need to parse the records. In this case, it's the
// size of the barcode and the umi.
let tag_context = p.get_record_context::<AlevinFryRecordContext>()?;
let first_chunk = Chunk::<AlevinFryReadRecord>::from_bytes(&mut ifile, &tag_context);
println!(
"Chunk :: nbytes: {}, nrecs: {}",
first_chunk.nbytes, first_chunk.nrec
);
assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len());
for (i, r) in first_chunk.reads.iter().take(10).enumerate() {
println!("record {i}: {:?}", r);
}

Ok(())
}
48 changes: 48 additions & 0 deletions examples/read_header.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use anyhow;
use libradicl::{self, rad_types};
use std::io::BufReader;

fn main() -> anyhow::Result<()> {
let fname = std::env::args().nth(1).expect("input filename");
/*
{
let f = std::fs::File::open(&fname)?;
let mut ifile = BufReader::new(f);
let h = rad_types::RadHeader::from_bytes(&mut ifile)?;
if let Ok(summary) = h.summary(None) {
println!("{}", summary);
}
let ts = rad_types::TagSection::from_bytes(&mut ifile)?;
println!("File-level tags: {ts:?}");
let ts = rad_types::TagSection::from_bytes(&mut ifile)?;
println!("Read-level tags: {ts:?}");
let ts = rad_types::TagSection::from_bytes(&mut ifile)?;
println!("Alignment-level tags: {ts:?}");
}
*/

println!("\n");

{
let f = std::fs::File::open(&fname)?;
let mut ifile = BufReader::new(f);
let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?;
if let Ok(summary) = p.summary(None) {
println!("{}", summary);
}
let ftmp = p.file_tags.parse_tags_from_bytes(&mut ifile)?;
if let Some(rad_types::TagValue::ArrayU32(v)) = ftmp.get("ref_lengths") {
println!(
"file tag values: {:?}",
v.iter().take(10).collect::<Vec<&u32>>()
);
} else {
println!("file-level tags: {:?}", &ftmp);
}
}
Ok(())
}
69 changes: 69 additions & 0 deletions src/chunk.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
use crate::{self as libradicl};
use libradicl::record::MappedRecord;
use scroll::Pread;
use std::io::{Cursor, Read};

#[derive(Debug)]
pub struct Chunk<T: MappedRecord> {
pub nbytes: u32,
pub nrec: u32,
pub reads: Vec<T>,
}

#[derive(Debug)]
#[allow(dead_code)]
pub struct CorrectedCbChunk {
pub(crate) remaining_records: u32,
pub(crate) corrected_bc: u64,
pub(crate) nrec: u32,
pub(crate) data: Cursor<Vec<u8>>, /*,
umis: Vec<u64>,
ref_offsets: Vec<u32>,
ref_ids: Vec<u32>,
*/
}

pub struct ChunkConfig {
pub num_chunks: u64,
pub bc_type: u8,
pub umi_type: u8,
}

impl<R: MappedRecord> Chunk<R> {
/// Read the header of the next [Chunk] from the provided `reader`. This
/// function returns a tuple representing the number of bytes and number of
/// records, respectively, in the chunk.
#[inline]
pub fn read_header<T: Read>(reader: &mut T) -> (u32, u32) {
let mut buf = [0u8; 8];
reader.read_exact(&mut buf).unwrap();
let nbytes = buf.pread::<u32>(0).unwrap();
let nrec = buf.pread::<u32>(4).unwrap();
(nbytes, nrec)
}

/// Read the next [Chunk] from the provided reader and return it.
#[inline]
pub fn from_bytes<T: Read>(reader: &mut T, ctx: &R::ParsingContext) -> Self {
let (nbytes, nrec) = Self::read_header(reader);
let mut c = Self {
nbytes,
nrec,
reads: Vec::<R>::with_capacity(nrec as usize),
};

for _ in 0..(nrec as usize) {
c.reads.push(R::from_bytes_with_context(reader, ctx));
}

c
}

/// Peeks to the first [libradicl::record::AlevinFryReadRecord] in the buffer `buf`, and returns
/// the barcode and umi associated with this record. It is assumed
/// that there is at least one [libradicl::record::AlevinFryReadRecord] present in the buffer.
#[inline]
pub fn peek_record(buf: &[u8], ctx: &R::ParsingContext) -> R::PeekResult {
R::peek_record(buf, ctx)
}
}
1 change: 1 addition & 0 deletions src/constants.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub(crate) const MAX_REF_NAME_LEN: usize = 65536;
176 changes: 176 additions & 0 deletions src/header.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
use crate::{self as libradicl, constants};
use libradicl::rad_types::{TagSection, TagSectionLabel};
use libradicl::record::RecordContext;
use noodles_sam as sam;
use scroll::Pread;
use std::io::Read;

/// The [RadPrelude] groups together the [RadHeader]
/// as well as the relevant top-level [TagSection]s of the file.
/// It constitutes everything in the initial file prior to the
/// start of the first [libradicl::chunk::Chunk].
pub struct RadPrelude {
pub hdr: RadHeader,
pub file_tags: TagSection,
pub read_tags: TagSection,
pub aln_tags: TagSection,
}

/// The [RadHeader] contains the relevant information about the
/// references against which the reads in this file were mapped and
/// information about the way in which mapping was performed.
pub struct RadHeader {
pub is_paired: u8,
pub ref_count: u64,
pub ref_names: Vec<String>,
pub num_chunks: u64,
}

impl Default for RadHeader {
fn default() -> Self {
Self::new()
}
}

impl RadHeader {
/// Create a new empty [RadHeader]
pub fn new() -> Self {
Self {
is_paired: 0,
ref_count: 0,
ref_names: vec![],
num_chunks: 0,
}
}

/// Create and return a new [RadHeader] by reading the contents of the
/// `reader`. If the reader is positioned such that a valid [RadHeader] comes
/// next, then this function returns [Ok(RadHeader)], otherwise, it returns
/// an [anyhow::Error] explaining the failure to parse the [RadHeader].
pub fn from_bytes<T: Read>(reader: &mut T) -> anyhow::Result<RadHeader> {
let mut rh = RadHeader::new();

// size of the longest allowable string.
let mut buf = [0u8; constants::MAX_REF_NAME_LEN];
reader.read_exact(&mut buf[0..9])?;
rh.is_paired = buf.pread(0)?;
rh.ref_count = buf.pread::<u64>(1)?;

// we know how many names we will read in.
rh.ref_names.reserve_exact(rh.ref_count as usize);

let mut num_read = 0u64;
while num_read < rh.ref_count {
// the length of the string
reader.read_exact(&mut buf[0..2])?;
let l: usize = buf.pread::<u16>(0)? as usize;
// the string itself
reader.read_exact(&mut buf[0..l])?;
rh.ref_names
.push(std::str::from_utf8(&buf[0..l])?.to_string());
num_read += 1;
}

reader.read_exact(&mut buf[0..8])?;
rh.num_chunks = buf.pread::<u64>(0)?;
Ok(rh)
}

/// Create and return a [RadHeader] from the provided BAM/SAM header
/// (represented by the noodles [sam::Header] `header`).
/// **Note**: The returned [RadHeader] will *not* have a value for the `num_chunks`
/// field, which will remain set at 0, nor will it set a meaningful value for the
/// `is_paried` flag, since the SAM/BAM header itself doesn't encode this information.
pub fn from_bam_header(header: &sam::Header) -> RadHeader {
let mut rh = RadHeader {
is_paired: 0,
ref_count: 0,
ref_names: vec![],
num_chunks: 0,
};

let ref_seqs = header.reference_sequences();
rh.ref_count = ref_seqs.len() as u64;
// we know how many names we will read in.
rh.ref_names.reserve_exact(rh.ref_count as usize);
for (k, _v) in ref_seqs.iter() {
rh.ref_names.push(k.to_string());
}
rh
}

/// Returns the size, in bytes, that this [RadHeader] will take
/// if written to an output stream.
pub fn get_size(&self) -> usize {
let mut tot_size = 0usize;
tot_size += std::mem::size_of_val(&self.is_paired) + std::mem::size_of_val(&self.ref_count);
// each name takes 2 bytes for the length, plus the actual
// number of bytes required by the string itself.
for t in self.ref_names.iter().map(|a| a.len()) {
tot_size += std::mem::size_of::<u16>() + t;
}
tot_size += std::mem::size_of_val(&self.num_chunks);
tot_size
}

pub fn summary(&self, num_refs: Option<usize>) -> anyhow::Result<String> {
use std::fmt::Write as _;
let mut s = String::new();
writeln!(&mut s, "RadHeader {{")?;
writeln!(&mut s, "is_paired: {}", self.is_paired)?;
writeln!(&mut s, "ref_count: {}", self.ref_count)?;

let refs_to_print = match num_refs {
Some(rcount) => rcount.min(self.ref_count as usize),
None => (self.ref_count as usize).min(10_usize),
};

for rn in self.ref_names.iter().take(refs_to_print) {
writeln!(&mut s, " ref: {}", rn)?;
}
writeln!(&mut s, " ...")?;

writeln!(&mut s, "num_chunks: {}", self.num_chunks)?;
writeln!(&mut s, "}}")?;
Ok(s)
}
}

impl RadPrelude {
pub fn from_bytes<T: Read>(reader: &mut T) -> anyhow::Result<Self> {
let hdr = RadHeader::from_bytes(reader)?;
let file_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::FileTags)?;
let read_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::ReadTags)?;
let aln_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::AlignmentTags)?;

//let file_tag_vals = file_tags.parse_tags_from_bytes(reader)?;
//println!("file-level tag values: {:?}", file_tag_vals);

Ok(Self {
hdr,
file_tags,
read_tags,
aln_tags,
})
}

pub fn summary(&self, num_refs: Option<usize>) -> anyhow::Result<String> {
use std::fmt::Write as _;
let mut s = self.hdr.summary(num_refs)?;
writeln!(&mut s, "[[{:?}]]", self.file_tags)?;
writeln!(&mut s, "[[{:?}]]", self.read_tags)?;
writeln!(&mut s, "[[{:?}]]", self.aln_tags)?;
//writeln!(&mut s, "file-level tag values [{:?}]", self.file_tag_vals)?;
Ok(s)
}

/// Obtain a [RecordContext] for a record of type `R` from this prelude, by
/// using the associated [TagSection]s. **Note**: Since this function
/// constructs the resulting `R` itself, and doesn't take any `R` parameter,
/// then it must always be invoked with the proper
/// [turbofish](https://doc.rust-lang.org/1.30.0/book/2018-edition/appendix-02-operators.html?highlight=turbofish#non-operator-symbols)
/// notation.
pub fn get_record_context<R: RecordContext>(&self) -> anyhow::Result<R> {
R::get_context_from_tag_section(&self.file_tags, &self.read_tags, &self.aln_tags)
}
}
Loading

0 comments on commit 30db3a6

Please sign in to comment.