-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #21 from COMBINE-lab/develop
feat: major rework of libradicl
- Loading branch information
Showing
12 changed files
with
1,474 additions
and
387 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[package] | ||
name = "libradicl" | ||
version = "0.7.0" | ||
version = "0.8.0" | ||
authors = [ | ||
"Avi Srivastava <[email protected]>", | ||
"Hirak Sarkar <[email protected]>", | ||
|
@@ -15,7 +15,7 @@ readme = "README.md" | |
repository = "https://github.com/COMBINE-lab/alevin-fry" | ||
homepage = "https://github.com/COMBINE-lab/alevin-fry" | ||
documentation = "https://alevin-fry.readthedocs.io/en/latest/" | ||
include = ["src/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"] | ||
include = ["src/*.rs", "examples/*.rs", "/Cargo.toml", "/README.md", "/LICENSE"] | ||
keywords = [ | ||
"single-cell", | ||
"preprocessing", | ||
|
@@ -29,10 +29,14 @@ categories = ["command-line-utilities", "science"] | |
snap = "1" | ||
scroll = "0.12.0" | ||
num = "0.4.1" | ||
ahash = "0.8.8" | ||
serde = { version = "1.0.196", features = ["derive"] } | ||
ahash = "0.8.9" | ||
serde = { version = "1.0.197", features = ["derive"] } | ||
dashmap = "^5.5.3" | ||
bio-types = "1.0.1" | ||
smallvec = "1.13.1" | ||
noodles-bam = "0.56.0" | ||
noodles-sam = "0.53.0" | ||
anyhow = "1.0.80" | ||
|
||
[dev-dependencies] | ||
needletail="0.5.1" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
use anyhow; | ||
use libradicl; | ||
use libradicl::chunk::Chunk; | ||
use libradicl::record::{PiscemBulkReadRecord, PiscemBulkRecordContext}; | ||
use std::io::BufReader; | ||
|
||
fn main() -> anyhow::Result<()> { | ||
let fname = std::env::args().nth(1).expect("input filename"); | ||
let f = std::fs::File::open(&fname)?; | ||
let mut ifile = BufReader::new(f); | ||
let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?; | ||
if let Ok(summary) = p.summary(None) { | ||
println!("{}", summary); | ||
} | ||
|
||
let _tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?; | ||
|
||
// Any extra context we may need to parse the records. In this case, it's the | ||
// size of the barcode and the umi. | ||
let tag_context = p.get_record_context::<PiscemBulkRecordContext>()?; | ||
let first_chunk = Chunk::<PiscemBulkReadRecord>::from_bytes(&mut ifile, &tag_context); | ||
println!( | ||
"Chunk :: nbytes: {}, nrecs: {}", | ||
first_chunk.nbytes, first_chunk.nrec | ||
); | ||
assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len()); | ||
for (i, r) in first_chunk.reads.iter().take(10).enumerate() { | ||
println!("record {i}: {:?}", r); | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
use anyhow; | ||
use libradicl::chunk::Chunk; | ||
use libradicl::header; | ||
use libradicl::record::{AlevinFryReadRecord, AlevinFryRecordContext}; | ||
use std::io::BufReader; | ||
|
||
fn main() -> anyhow::Result<()> { | ||
let fname = std::env::args().nth(1).expect("input filename"); | ||
let f = std::fs::File::open(&fname)?; | ||
let mut ifile = BufReader::new(f); | ||
let p = header::RadPrelude::from_bytes(&mut ifile)?; | ||
if let Ok(summary) = p.summary(None) { | ||
println!("{}", summary); | ||
} | ||
|
||
let tag_map = p.file_tags.parse_tags_from_bytes_checked(&mut ifile)?; | ||
println!("tag map {:?}\n", tag_map); | ||
// Any extra context we may need to parse the records. In this case, it's the | ||
// size of the barcode and the umi. | ||
let tag_context = p.get_record_context::<AlevinFryRecordContext>()?; | ||
let first_chunk = Chunk::<AlevinFryReadRecord>::from_bytes(&mut ifile, &tag_context); | ||
println!( | ||
"Chunk :: nbytes: {}, nrecs: {}", | ||
first_chunk.nbytes, first_chunk.nrec | ||
); | ||
assert_eq!(first_chunk.nrec as usize, first_chunk.reads.len()); | ||
for (i, r) in first_chunk.reads.iter().take(10).enumerate() { | ||
println!("record {i}: {:?}", r); | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
use anyhow; | ||
use libradicl::{self, rad_types}; | ||
use std::io::BufReader; | ||
|
||
fn main() -> anyhow::Result<()> { | ||
let fname = std::env::args().nth(1).expect("input filename"); | ||
/* | ||
{ | ||
let f = std::fs::File::open(&fname)?; | ||
let mut ifile = BufReader::new(f); | ||
let h = rad_types::RadHeader::from_bytes(&mut ifile)?; | ||
if let Ok(summary) = h.summary(None) { | ||
println!("{}", summary); | ||
} | ||
let ts = rad_types::TagSection::from_bytes(&mut ifile)?; | ||
println!("File-level tags: {ts:?}"); | ||
let ts = rad_types::TagSection::from_bytes(&mut ifile)?; | ||
println!("Read-level tags: {ts:?}"); | ||
let ts = rad_types::TagSection::from_bytes(&mut ifile)?; | ||
println!("Alignment-level tags: {ts:?}"); | ||
} | ||
*/ | ||
|
||
println!("\n"); | ||
|
||
{ | ||
let f = std::fs::File::open(&fname)?; | ||
let mut ifile = BufReader::new(f); | ||
let p = libradicl::header::RadPrelude::from_bytes(&mut ifile)?; | ||
if let Ok(summary) = p.summary(None) { | ||
println!("{}", summary); | ||
} | ||
let ftmp = p.file_tags.parse_tags_from_bytes(&mut ifile)?; | ||
if let Some(rad_types::TagValue::ArrayU32(v)) = ftmp.get("ref_lengths") { | ||
println!( | ||
"file tag values: {:?}", | ||
v.iter().take(10).collect::<Vec<&u32>>() | ||
); | ||
} else { | ||
println!("file-level tags: {:?}", &ftmp); | ||
} | ||
} | ||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
use crate::{self as libradicl}; | ||
use libradicl::record::MappedRecord; | ||
use scroll::Pread; | ||
use std::io::{Cursor, Read}; | ||
|
||
#[derive(Debug)] | ||
pub struct Chunk<T: MappedRecord> { | ||
pub nbytes: u32, | ||
pub nrec: u32, | ||
pub reads: Vec<T>, | ||
} | ||
|
||
#[derive(Debug)] | ||
#[allow(dead_code)] | ||
pub struct CorrectedCbChunk { | ||
pub(crate) remaining_records: u32, | ||
pub(crate) corrected_bc: u64, | ||
pub(crate) nrec: u32, | ||
pub(crate) data: Cursor<Vec<u8>>, /*, | ||
umis: Vec<u64>, | ||
ref_offsets: Vec<u32>, | ||
ref_ids: Vec<u32>, | ||
*/ | ||
} | ||
|
||
pub struct ChunkConfig { | ||
pub num_chunks: u64, | ||
pub bc_type: u8, | ||
pub umi_type: u8, | ||
} | ||
|
||
impl<R: MappedRecord> Chunk<R> { | ||
/// Read the header of the next [Chunk] from the provided `reader`. This | ||
/// function returns a tuple representing the number of bytes and number of | ||
/// records, respectively, in the chunk. | ||
#[inline] | ||
pub fn read_header<T: Read>(reader: &mut T) -> (u32, u32) { | ||
let mut buf = [0u8; 8]; | ||
reader.read_exact(&mut buf).unwrap(); | ||
let nbytes = buf.pread::<u32>(0).unwrap(); | ||
let nrec = buf.pread::<u32>(4).unwrap(); | ||
(nbytes, nrec) | ||
} | ||
|
||
/// Read the next [Chunk] from the provided reader and return it. | ||
#[inline] | ||
pub fn from_bytes<T: Read>(reader: &mut T, ctx: &R::ParsingContext) -> Self { | ||
let (nbytes, nrec) = Self::read_header(reader); | ||
let mut c = Self { | ||
nbytes, | ||
nrec, | ||
reads: Vec::<R>::with_capacity(nrec as usize), | ||
}; | ||
|
||
for _ in 0..(nrec as usize) { | ||
c.reads.push(R::from_bytes_with_context(reader, ctx)); | ||
} | ||
|
||
c | ||
} | ||
|
||
/// Peeks to the first [libradicl::record::AlevinFryReadRecord] in the buffer `buf`, and returns | ||
/// the barcode and umi associated with this record. It is assumed | ||
/// that there is at least one [libradicl::record::AlevinFryReadRecord] present in the buffer. | ||
#[inline] | ||
pub fn peek_record(buf: &[u8], ctx: &R::ParsingContext) -> R::PeekResult { | ||
R::peek_record(buf, ctx) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pub(crate) const MAX_REF_NAME_LEN: usize = 65536; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
use crate::{self as libradicl, constants}; | ||
use libradicl::rad_types::{TagSection, TagSectionLabel}; | ||
use libradicl::record::RecordContext; | ||
use noodles_sam as sam; | ||
use scroll::Pread; | ||
use std::io::Read; | ||
|
||
/// The [RadPrelude] groups together the [RadHeader] | ||
/// as well as the relevant top-level [TagSection]s of the file. | ||
/// It constitutes everything in the initial file prior to the | ||
/// start of the first [libradicl::chunk::Chunk]. | ||
pub struct RadPrelude { | ||
pub hdr: RadHeader, | ||
pub file_tags: TagSection, | ||
pub read_tags: TagSection, | ||
pub aln_tags: TagSection, | ||
} | ||
|
||
/// The [RadHeader] contains the relevant information about the | ||
/// references against which the reads in this file were mapped and | ||
/// information about the way in which mapping was performed. | ||
pub struct RadHeader { | ||
pub is_paired: u8, | ||
pub ref_count: u64, | ||
pub ref_names: Vec<String>, | ||
pub num_chunks: u64, | ||
} | ||
|
||
impl Default for RadHeader { | ||
fn default() -> Self { | ||
Self::new() | ||
} | ||
} | ||
|
||
impl RadHeader { | ||
/// Create a new empty [RadHeader] | ||
pub fn new() -> Self { | ||
Self { | ||
is_paired: 0, | ||
ref_count: 0, | ||
ref_names: vec![], | ||
num_chunks: 0, | ||
} | ||
} | ||
|
||
/// Create and return a new [RadHeader] by reading the contents of the | ||
/// `reader`. If the reader is positioned such that a valid [RadHeader] comes | ||
/// next, then this function returns [Ok(RadHeader)], otherwise, it returns | ||
/// an [anyhow::Error] explaining the failure to parse the [RadHeader]. | ||
pub fn from_bytes<T: Read>(reader: &mut T) -> anyhow::Result<RadHeader> { | ||
let mut rh = RadHeader::new(); | ||
|
||
// size of the longest allowable string. | ||
let mut buf = [0u8; constants::MAX_REF_NAME_LEN]; | ||
reader.read_exact(&mut buf[0..9])?; | ||
rh.is_paired = buf.pread(0)?; | ||
rh.ref_count = buf.pread::<u64>(1)?; | ||
|
||
// we know how many names we will read in. | ||
rh.ref_names.reserve_exact(rh.ref_count as usize); | ||
|
||
let mut num_read = 0u64; | ||
while num_read < rh.ref_count { | ||
// the length of the string | ||
reader.read_exact(&mut buf[0..2])?; | ||
let l: usize = buf.pread::<u16>(0)? as usize; | ||
// the string itself | ||
reader.read_exact(&mut buf[0..l])?; | ||
rh.ref_names | ||
.push(std::str::from_utf8(&buf[0..l])?.to_string()); | ||
num_read += 1; | ||
} | ||
|
||
reader.read_exact(&mut buf[0..8])?; | ||
rh.num_chunks = buf.pread::<u64>(0)?; | ||
Ok(rh) | ||
} | ||
|
||
/// Create and return a [RadHeader] from the provided BAM/SAM header | ||
/// (represented by the noodles [sam::Header] `header`). | ||
/// **Note**: The returned [RadHeader] will *not* have a value for the `num_chunks` | ||
/// field, which will remain set at 0, nor will it set a meaningful value for the | ||
/// `is_paried` flag, since the SAM/BAM header itself doesn't encode this information. | ||
pub fn from_bam_header(header: &sam::Header) -> RadHeader { | ||
let mut rh = RadHeader { | ||
is_paired: 0, | ||
ref_count: 0, | ||
ref_names: vec![], | ||
num_chunks: 0, | ||
}; | ||
|
||
let ref_seqs = header.reference_sequences(); | ||
rh.ref_count = ref_seqs.len() as u64; | ||
// we know how many names we will read in. | ||
rh.ref_names.reserve_exact(rh.ref_count as usize); | ||
for (k, _v) in ref_seqs.iter() { | ||
rh.ref_names.push(k.to_string()); | ||
} | ||
rh | ||
} | ||
|
||
/// Returns the size, in bytes, that this [RadHeader] will take | ||
/// if written to an output stream. | ||
pub fn get_size(&self) -> usize { | ||
let mut tot_size = 0usize; | ||
tot_size += std::mem::size_of_val(&self.is_paired) + std::mem::size_of_val(&self.ref_count); | ||
// each name takes 2 bytes for the length, plus the actual | ||
// number of bytes required by the string itself. | ||
for t in self.ref_names.iter().map(|a| a.len()) { | ||
tot_size += std::mem::size_of::<u16>() + t; | ||
} | ||
tot_size += std::mem::size_of_val(&self.num_chunks); | ||
tot_size | ||
} | ||
|
||
pub fn summary(&self, num_refs: Option<usize>) -> anyhow::Result<String> { | ||
use std::fmt::Write as _; | ||
let mut s = String::new(); | ||
writeln!(&mut s, "RadHeader {{")?; | ||
writeln!(&mut s, "is_paired: {}", self.is_paired)?; | ||
writeln!(&mut s, "ref_count: {}", self.ref_count)?; | ||
|
||
let refs_to_print = match num_refs { | ||
Some(rcount) => rcount.min(self.ref_count as usize), | ||
None => (self.ref_count as usize).min(10_usize), | ||
}; | ||
|
||
for rn in self.ref_names.iter().take(refs_to_print) { | ||
writeln!(&mut s, " ref: {}", rn)?; | ||
} | ||
writeln!(&mut s, " ...")?; | ||
|
||
writeln!(&mut s, "num_chunks: {}", self.num_chunks)?; | ||
writeln!(&mut s, "}}")?; | ||
Ok(s) | ||
} | ||
} | ||
|
||
impl RadPrelude { | ||
pub fn from_bytes<T: Read>(reader: &mut T) -> anyhow::Result<Self> { | ||
let hdr = RadHeader::from_bytes(reader)?; | ||
let file_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::FileTags)?; | ||
let read_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::ReadTags)?; | ||
let aln_tags = TagSection::from_bytes_with_label(reader, TagSectionLabel::AlignmentTags)?; | ||
|
||
//let file_tag_vals = file_tags.parse_tags_from_bytes(reader)?; | ||
//println!("file-level tag values: {:?}", file_tag_vals); | ||
|
||
Ok(Self { | ||
hdr, | ||
file_tags, | ||
read_tags, | ||
aln_tags, | ||
}) | ||
} | ||
|
||
pub fn summary(&self, num_refs: Option<usize>) -> anyhow::Result<String> { | ||
use std::fmt::Write as _; | ||
let mut s = self.hdr.summary(num_refs)?; | ||
writeln!(&mut s, "[[{:?}]]", self.file_tags)?; | ||
writeln!(&mut s, "[[{:?}]]", self.read_tags)?; | ||
writeln!(&mut s, "[[{:?}]]", self.aln_tags)?; | ||
//writeln!(&mut s, "file-level tag values [{:?}]", self.file_tag_vals)?; | ||
Ok(s) | ||
} | ||
|
||
/// Obtain a [RecordContext] for a record of type `R` from this prelude, by | ||
/// using the associated [TagSection]s. **Note**: Since this function | ||
/// constructs the resulting `R` itself, and doesn't take any `R` parameter, | ||
/// then it must always be invoked with the proper | ||
/// [turbofish](https://doc.rust-lang.org/1.30.0/book/2018-edition/appendix-02-operators.html?highlight=turbofish#non-operator-symbols) | ||
/// notation. | ||
pub fn get_record_context<R: RecordContext>(&self) -> anyhow::Result<R> { | ||
R::get_context_from_tag_section(&self.file_tags, &self.read_tags, &self.aln_tags) | ||
} | ||
} |
Oops, something went wrong.