diff --git a/crates/binjs_io/src/context/huffman.rs b/crates/binjs_io/src/context/huffman.rs deleted file mode 100644 index 62cfaf4f6..000000000 --- a/crates/binjs_io/src/context/huffman.rs +++ /dev/null @@ -1,289 +0,0 @@ -use io::statistics::Instances; - -use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; -use std::hash::Hash; - -/// A newtype for `u8` used to count the length of a key in bits. -#[derive( - Debug, - Default, - Display, - Serialize, - Deserialize, - From, - Into, - Add, - AddAssign, - Sub, - SubAssign, - Clone, - Copy, - PartialOrd, - Ord, - PartialEq, - Eq, -)] -pub struct BitLen(u8); - -/// Convenience implementation of operator `<<` in -/// `bits << bit_len` -impl std::ops::Shl for u32 { - type Output = u32; - fn shl(self, rhs: BitLen) -> u32 { - self << Into::::into(rhs) - } -} - -/// The largerst acceptable length for a key. -/// -/// Hardcoded in the format. -const MAX_CODE_BIT_LENGTH: u8 = 20; - -/// A Huffman key -#[derive(Debug)] -struct Key { - /// The bits in the key. - /// - /// Note that we only use the `bit_len` lowest-weight bits. - /// Any other bit is ignored. - bits: u32, - - /// The number of bits of `bits` to use. - bit_len: BitLen, -} - -/// A node in the Huffman tree. -struct Node { - /// The total number of instances of all `NodeContent::Leaf(T)` in this subtree. - instances: Instances, - - /// The content of the node. - content: NodeContent, -} - -/// Contents of a node in the Huffman tree. -enum NodeContent { - /// A value from the stream of values. - Leaf(T), - - /// An internal node obtained by joining two subtrees. - Internal { - left: Box>, - right: Box>, - }, -} - -/// Custom ordering of `NodeContent`. -/// -/// We compare *only* by number of instances. 
-impl PartialOrd for Node { - fn partial_cmp(&self, other: &Self) -> Option { - self.instances.partial_cmp(&other.instances) - } -} -impl Ord for Node { - fn cmp(&self, other: &Self) -> Ordering { - self.instances.cmp(&other.instances) - } -} -impl PartialEq for Node { - fn eq(&self, other: &Self) -> bool { - self.instances.eq(&other.instances) - } -} -impl Eq for Node {} - -/// Keys associated to a sequence of values. -#[derive(Debug)] -pub struct Keys -where - T: Ord + Clone, -{ - /// The sequence of keys. - /// - /// Order is meaningful. - keys: Vec<(T, Key)>, -} - -impl Keys -where - T: Ord + Clone, -{ - /// Compute a `Keys` from a sequence of values. - /// - /// Optionally, `max_bit_len` may specify a largest acceptable bit length. - /// If `Keys` may not be computed without exceeding this bit length, - /// fail with `Err(problemantic_bit_length)`. - /// - /// The current implementation only attempts to produce the best compression - /// level. This may cause us to exceed `max_bit_length` even though an - /// alternative table, with a lower compression level, would let us - /// proceed without exceeding `max_bit_length`. - /// - /// # Performance - /// - /// Values (type `T`) will be cloned regularly, so you should make - /// sure that their cloning is reasonably cheap. - pub fn from_sequence(source: S, max_bit_len: u8) -> Result - where - S: IntoIterator, - T: PartialEq + Hash, - { - // Count the values. - let mut map = HashMap::new(); - for item in source { - let counter = map.entry(item).or_insert(0.into()); - *counter += 1.into(); - } - // Then compute the `Keys`. - Self::from_instances(map, max_bit_len) - } - - /// Compute a `Keys` from a sequence of values - /// with a number of instances already attached. - /// - /// The current implementation only attempts to produce the best compression - /// level. 
This may cause us to exceed `max_bit_length` even though an - /// alternative table, with a lower compression level, would let us - /// proceed without exceeding `max_bit_length`. - /// - /// # Requirement - /// - /// Values of `T` in the source MUST be distinct. - pub fn from_instances(source: S, max_bit_len: u8) -> Result - where - S: IntoIterator, - { - let mut bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?; - - // Canonicalize order: (BitLen, T) - // As values of `T` are - bit_lengths.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone())); - - // The bits associated to the next value. - let mut bits = 0; - let mut keys = Vec::with_capacity(bit_lengths.len()); - - for i in 0..bit_lengths.len() - 1 { - let (bit_len, symbol, next_bit_len) = ( - bit_lengths[i].1, - bit_lengths[i].0.clone(), - bit_lengths[i + 1].1, - ); - keys.push((symbol.clone(), Key { bits, bit_len })); - bits = (bits + 1) << (next_bit_len - bit_len); - } - // Handle the last element. - let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1]; - keys.push((symbol.clone(), Key { bits, bit_len })); - - return Ok(Self { keys }); - } - - /// Convert a sequence of values labelled by their number of instances - /// into a sequence of values labelled by the length for their path - /// in the Huffman tree, aka the bitlength of their Huffman key. - /// - /// Values that have 0 instances are skipped. - pub fn compute_bit_lengths(source: S, max_bit_len: u8) -> Result, u8> - where - S: IntoIterator, - { - // Build a min-heap sorted by number of instances. - use std::cmp::Reverse; - let mut heap = BinaryHeap::new(); - - // Skip values that have 0 instances. - for (value, instances) in source { - if !instances.is_zero() { - heap.push(Reverse(Node { - instances, - content: NodeContent::Leaf(value), - })); - } - } - - let len = heap.len(); - if len == 0 { - // Special case: no tree to build. 
- return Ok(vec![]); - } - - // Take the two rarest nodes, merge them behind a prefix, - // turn them into a single node with combined number of - // instances. Repeat. - while heap.len() > 1 { - let left = heap.pop().unwrap(); - let right = heap.pop().unwrap(); - heap.push(Reverse(Node { - instances: left.0.instances + right.0.instances, - content: NodeContent::Internal { - left: Box::new(left.0.content), - right: Box::new(right.0.content), - }, - })); - } - - // Convert tree into bit lengths - let root = heap.pop().unwrap(); // We have checked above that there is at least one value. - let mut bit_lengths = Vec::with_capacity(len); - fn aux( - bit_lengths: &mut Vec<(T, BitLen)>, - max_bit_len: u8, - depth: u8, - node: &NodeContent, - ) -> Result<(), u8> - where - T: Clone, - { - match *node { - NodeContent::Leaf(ref value) => { - if depth > max_bit_len { - return Err(depth); - } - bit_lengths.push((value.clone(), BitLen(depth))); - Ok(()) - } - NodeContent::Internal { - ref left, - ref right, - } => { - aux(bit_lengths, max_bit_len, depth + 1, left)?; - aux(bit_lengths, max_bit_len, depth + 1, right)?; - Ok(()) - } - } - } - aux(&mut bit_lengths, max_bit_len, 0, &root.0.content)?; - - Ok(bit_lengths) - } -} - -#[test] -fn test_coded_from_sequence() { - let sample = "appl"; - let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap(); - - // Symbol 'p' appears twice, we should see 3 codes. - assert_eq!(coded.keys.len(), 3); - - // Check order of symbols. - assert_eq!(coded.keys[0].0, 'p'); - assert_eq!(coded.keys[1].0, 'a'); - assert_eq!(coded.keys[2].0, 'l'); - - // Check bit length of symbols. - assert_eq!(coded.keys[0].1.bit_len, 1.into()); - assert_eq!(coded.keys[1].1.bit_len, 2.into()); - assert_eq!(coded.keys[2].1.bit_len, 2.into()); - - // Check code of symbols. - assert_eq!(coded.keys[0].1.bits, 0b00); - assert_eq!(coded.keys[1].1.bits, 0b10); - assert_eq!(coded.keys[2].1.bits, 0b11); - - // Let's try again with a limit to 1 bit paths. 
- assert_eq!(Keys::from_sequence(sample.chars(), 1).unwrap_err(), 2); -} diff --git a/crates/binjs_io/src/context/huffman/codebook.rs b/crates/binjs_io/src/context/huffman/codebook.rs new file mode 100644 index 000000000..5b588d30c --- /dev/null +++ b/crates/binjs_io/src/context/huffman/codebook.rs @@ -0,0 +1,622 @@ +use context::huffman::*; +use context::varnum::{ReadVaru32, WriteVaru32}; + +use std::io::{self, Read, Write}; + +const TABLE_HEADER_UNIT: u8 = 0; +const TABLE_HEADER_MULTI: u8 = 1; +const TABLE_HEADER_EMPTY: u8 = 2; + +const VEC_MAX_PRE_ALLOC: usize = 1024; + +/// Codebook associated to a sequence of values. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Codebook { + /// The longest bit length that actually appears in `mappings`. + highest_bit_len: BitLen, + + /// The sequence of keys. + /// + /// Order is meaningful. + mappings: Vec<(T, Key)>, +} + +impl Codebook { + /// The number of elements in this Codebook. + pub fn len(&self) -> usize { + self.mappings.len() + } + + /// The longest bit length that acctually appears in this Codebook. + pub fn highest_bit_len(&self) -> BitLen { + self.highest_bit_len + } +} + +impl IntoIterator for Codebook { + type Item = (T, Key); + type IntoIter = std::vec::IntoIter<(T, Key)>; + fn into_iter(self) -> Self::IntoIter { + self.mappings.into_iter() + } +} + +impl Codebook +where + T: Ord + Clone, +{ + /// Compute a `Codebook` from a sequence of values. + /// + /// Optionally, `max_bit_len` may specify a largest acceptable bit length. + /// If the `Codebook` may not be computed without exceeding this bit length, + /// fail with `Err(problemantic_bit_len)`. + /// + /// The current implementation only attempts to produce the best compression + /// level. This may cause us to exceed `max_bit_len` even though an + /// alternative table, with a lower compression level, would let us + /// proceed without exceeding `max_bit_len`. 
+ /// + /// # Performance + /// + /// Values (type `T`) will be cloned regularly, so you should make + /// sure that their cloning is reasonably cheap. + pub fn from_sequence(source: S, max_bit_len: BitLen) -> Result + where + S: IntoIterator, + T: PartialEq + Hash, + { + // Count the values. + let mut map = HashMap::new(); + for item in source { + let counter = map.entry(item).or_insert(0.into()); + *counter += 1.into(); + } + // Then compute the `Codebook`. + Self::from_instances(map, max_bit_len) + } + + /// Compute a `Codebook` from a sequence of values + /// with a number of instances already attached. + /// + /// The current implementation only attempts to produce the best compression + /// level. This may cause us to exceed `max_bit_len` even though an + /// alternative table, with a lower compression level, would let us + /// proceed without exceeding `max_bit_len`. + /// + /// # Requirement + /// + /// Values of `T` in the source MUST be distinct. + pub fn from_instances(source: S, max_bit_len: BitLen) -> Result + where + S: IntoIterator, + { + let bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?; + Self::from_bit_lens(bit_lengths, max_bit_len) + } + + /// Compute a `Codebook` from a sequence of values + /// with a bit length already attached. + /// + /// The current implementation only attempts to produce the best compression + /// level. This may cause us to exceed `max_bit_len` even though an + /// alternative table, with a lower compression level, would let us + /// proceed without exceeding `max_bit_len`. + /// + /// # Requirement + /// + /// Values of `T` in the source MUST be distinct. + pub fn from_bit_lens( + mut bit_lens: Vec<(T, BitLen)>, + max_bit_len: BitLen, + ) -> Result { + let mut highest_bit_len = BitLen(0); + + // Canonicalize order: (BitLen, T) + bit_lens.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone())); + + // The bits associated to the next value. 
+ let mut bits = 0; + let mut mappings = Vec::with_capacity(bit_lens.len()); + + for i in 0..bit_lens.len() - 1 { + let (bit_len, symbol, next_bit_len) = + (bit_lens[i].1, bit_lens[i].0.clone(), bit_lens[i + 1].1); + mappings.push((symbol.clone(), Key::try_new(bits, bit_len)?)); + bits = (bits + 1) << (next_bit_len - bit_len); + if bit_len > highest_bit_len { + highest_bit_len = bit_len; + } + } + // Handle the last element. + let (ref symbol, bit_len) = bit_lens[bit_lens.len() - 1]; + if bit_len > highest_bit_len { + highest_bit_len = bit_len; + } + mappings.push((symbol.clone(), Key::new(bits, bit_len))); + + if highest_bit_len > max_bit_len { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Could not create a codebook that fits into this bit length", + )); + } + + return Ok(Self { + highest_bit_len, + mappings, + }); + } + + /// Convert a sequence of values labelled by their number of instances + /// into a sequence of values labelled by the length for their path + /// in the Huffman tree, aka the bitlength of their Huffman key. + /// + /// Values that have 0 instances are skipped. + pub fn compute_bit_lengths( + source: S, + max_bit_len: BitLen, + ) -> Result, std::io::Error> + where + S: IntoIterator, + { + // Build a min-heap sorted by number of instances. + use std::cmp::Reverse; + let mut heap = BinaryHeap::new(); + + // Skip values that have 0 instances. + for (value, instances) in source { + if !instances.is_zero() { + heap.push(Reverse(Node { + instances, + content: NodeContent::Leaf(value), + })); + } + } + + let len = heap.len(); + if len == 0 { + // Special case: no tree to build. + return Ok(vec![]); + } + + // Take the two rarest nodes, merge them behind a prefix, + // turn them into a single node with combined number of + // instances. Repeat. 
+ while heap.len() > 1 { + let left = heap.pop().unwrap(); + let right = heap.pop().unwrap(); + heap.push(Reverse(Node { + instances: left.0.instances + right.0.instances, + content: NodeContent::Internal { + left: Box::new(left.0.content), + right: Box::new(right.0.content), + }, + })); + } + + // Convert tree into bit lengths + let root = heap.pop().unwrap(); // We have checked above that there is at least one value. + let mut bit_lengths = Vec::with_capacity(len); + fn aux( + bit_lengths: &mut Vec<(T, BitLen)>, + max_bit_len: BitLen, + depth: u8, + node: &NodeContent, + ) -> Result<(), std::io::Error> + where + T: Clone, + { + match *node { + NodeContent::Leaf(ref value) => { + if depth > max_bit_len.as_u8() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Could not create a codebook that fits into this bit length", + )); + } + bit_lengths.push((value.clone(), BitLen(depth))); + Ok(()) + } + NodeContent::Internal { + ref left, + ref right, + } => { + aux(bit_lengths, max_bit_len, depth + 1, left)?; + aux(bit_lengths, max_bit_len, depth + 1, right)?; + Ok(()) + } + } + } + aux(&mut bit_lengths, max_bit_len, 0, &root.0.content)?; + + Ok(bit_lengths) + } +} + +#[test] +fn test_coded_from_sequence() { + let sample = "appl"; + let try_make_codebook = |bit_len| Codebook::from_sequence(sample.chars(), bit_len); + let coded = try_make_codebook(BitLen::new(std::u8::MAX)).unwrap(); + + // Symbol 'p' appears twice, we should see 3 codes. + assert_eq!(coded.mappings.len(), 3); + + // Check order of symbols. + assert_eq!(coded.mappings[0].0, 'p'); + assert_eq!(coded.mappings[1].0, 'a'); + assert_eq!(coded.mappings[2].0, 'l'); + + // Check bit length of symbols. + assert_eq!(coded.mappings[0].1.bit_len(), 1.into()); + assert_eq!(coded.mappings[1].1.bit_len(), 2.into()); + assert_eq!(coded.mappings[2].1.bit_len(), 2.into()); + + // Check code of symbols. 
+ assert_eq!(coded.mappings[0].1.bits(), 0b00); + assert_eq!(coded.mappings[1].1.bits(), 0b10); + assert_eq!(coded.mappings[2].1.bits(), 0b11); + + // Let's try again with a limit to 1 bit paths. + assert!(try_make_codebook(BitLen::new(1)).is_err()); +} + +impl Codebook { + /// Create an empty Codebook + pub fn new() -> Self { + Self { + highest_bit_len: BitLen::new(0), + mappings: vec![], + } + } + + /// Create an empty Codebook + pub fn with_capacity(len: usize) -> Self { + Self { + highest_bit_len: BitLen::new(0), + mappings: Vec::with_capacity(len), + } + } + + /// Add a mapping to a Codebook. + /// + /// This method does **not** check that the resulting Codebook is correct. + pub unsafe fn add_mapping(&mut self, value: T, key: Key) { + if key.bit_len() > self.highest_bit_len { + self.highest_bit_len = key.bit_len(); + } + self.mappings.push((value, key)); + } + + /// Return the mappings of a Codebook. + pub fn mappings(self) -> Vec<(T, Key)> { + self.mappings + } + + /// Iterate through this Codebook. + pub fn iter(&self) -> impl Iterator { + self.mappings.iter() + } +} + +/// Writing +impl Codebook +where + T: Ord + Clone + Hash, +{ + /// Write a Codebook for `StaticAlphabet`. 
+ fn write_static(&self, mut out: W) -> Result<(), io::Error> + where + A: StaticAlphabet, + W: Write, + { + match self.len() { + 0 => { + /* spec: EmptyCodeTable */ + out.write_all(&[TABLE_HEADER_EMPTY])?; + Ok(()) + } + 1 => { + /* spec: UnitCodeTable */ + out.write_all(&[TABLE_HEADER_UNIT])?; + A::write_literal(&self.mappings[0].0, out)?; + Ok(()) + } + _ => { + /* spec: MultiCodeTableImplicit */ + out.write_all(&[TABLE_HEADER_MULTI])?; + let map: HashMap<_, _> = self.mappings.iter().cloned().collect(); + for i in 0..A::len() { + let symbol = A::symbol(i).unwrap(); // We're in 0..A::len() + let bit_len = map + .get(&symbol) + .map(|key| key.bit_len().clone()) + .unwrap_or(BitLen::new(0)); + out.write_all(&[bit_len.as_u8()])?; + } + Ok(()) + } + } + } + + /// Write a Codebook for `DynamicAlphabet`. + fn write_dynamic(&self, mut out: W) -> Result<(), io::Error> + where + A: DynamicAlphabet, + W: Write, + { + match self.len() { + 0 => { + /* spec: EmptyCodeTable */ + out.write_all(&[TABLE_HEADER_EMPTY])?; + Ok(()) + } + 1 => { + /* spec: UnitCodeTable */ + out.write_all(&[TABLE_HEADER_UNIT])?; + A::write_literal(&self.mappings[0].0, out)?; + Ok(()) + } + _ => { + /* spec: MultiCodeTableExplicit */ + + // First the header. + out.write_all(&[TABLE_HEADER_MULTI])?; + + // Now, the length. + out.write_varu32(self.len() as u32)?; + + // Then bit lengths. + for &(_, ref key) in &self.mappings { + out.write_all(&[key.bit_len().as_u8()])?; + } + self.mappings.len(); + + // Then symbols. + for &(ref symbol, _) in &self.mappings { + A::write_literal(symbol, &mut out)?; + } + + Ok(()) + } + } + } +} + +/// Reading +impl Codebook +where + T: Ord + Clone, +{ + /// Parse a Codebook containing a single symbol. 
+ fn read_single_symbol(mut inp: R) -> Result + where + A: Alphabet, + R: Read, + { + let symbol = A::read_literal(&mut inp)?; + Codebook::from_bit_lens(vec![(symbol, BitLen::new(0))], MAX_CODE_BIT_LEN).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN", + ) + }) + } + + /// Parse a Codebook for `StaticAlphabet`. + pub fn read_static(mut inp: R) -> Result + where + A: StaticAlphabet, + R: Read, + { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + match byte[0] { + 0 => + /* spec: UnitCodeTable */ + { + Self::read_single_symbol::(inp) + } + 1 => + /* spec: MultiCodeTableImplicit */ + { + let number_of_symbols = A::len(); + let mut bit_lens = + Vec::with_capacity(usize::min(number_of_symbols as usize, VEC_MAX_PRE_ALLOC)); + for i in 0..number_of_symbols { + // Read the bit length. + let mut byte = [0]; + inp.read_exact(&mut byte)?; + let bit_len = BitLen::new(byte[0]); + + if bit_len > BitLen::new(0) { + // Extract the symbol from the grammar. + let symbol = A::symbol(i).unwrap(); // We're within 0..A::len() + + bit_lens.push((symbol, bit_len)); + } + } + // Finally, build a codebook. + Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN", + ) + }) + } + 2 => + /* spec: EmptyCodeTable */ + { + Ok(Codebook::new()) + } + _ => Err(io::Error::new( + io::ErrorKind::InvalidData, + "Incorrect CodeTable kind", + )), + } + } + + /// Parse a Codebook for `DynamicAlphabet`. 
+ pub fn read_dynamic(mut inp: R) -> Result + where + A: DynamicAlphabet, + R: Read, + T: Default, + { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + match byte[0] { + 0 => + /* spec: UnitCodeTable */ + { + Self::read_single_symbol::(inp) + } + 1 => + /* spec: MultiCodeTableExplicit */ + { + let number_of_symbols = *inp.read_varu32_no_normalization()?.value(); + let mut bit_lens = + Vec::with_capacity(usize::min(number_of_symbols as usize, VEC_MAX_PRE_ALLOC)); + + // Read bit lengths. + for _ in 0..number_of_symbols { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + bit_lens.push((T::default(), BitLen::new(byte[0]))); + } + + // Amend with symbols + for i in 0..number_of_symbols { + let symbol = A::read_literal(&mut inp)?; + bit_lens[i as usize].0 = symbol; + } + + // Finally, build a codebook. + Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN", + ) + }) + } + 2 => + /* spec: EmptyCodeTable */ + { + Ok(Codebook::new()) + } + _ => Err(io::Error::new( + io::ErrorKind::InvalidData, + "Incorrect CodeTable kind", + )), + } + } +} + +#[test] +fn read_and_write_codebook() { + use std::convert::TryInto; + use std::io; + + for sample in &[ + "appl", + "Lorem ipsum dolor sit amet consectetur adipiscing elit nunc, ridiculus hac natoque ante quisque imperdiet primis et euismod, pellentesque per turpis purus vestibulum quam dui. Himenaeos inceptos hac laoreet purus eros donec imperdiet, aliquam habitant felis class fusce etiam nulla facilisi, pretium eu nisl ultrices augue dictum. Venenatis mauris semper ultricies platea interdum sapien iaculis, habitasse eget habitant nec nam tincidunt, nulla aptent arcu duis laoreet volutpat. + +Torquent facilisi vestibulum erat eleifend diam convallis ac at, feugiat nullam vulputate euismod lacinia mollis quis venenatis, gravida porttitor cursus nascetur lacus per nostra. 
Platea ante curae netus torquent diam ultrices massa orci, vulputate sociis curabitur himenaeos litora sed aliquam nisi rutrum, cras porttitor per etiam iaculis eget arcu. Varius turpis libero metus luctus senectus condimentum cum mattis arcu, faucibus volutpat dapibus torquent ultrices fusce primis morbi, sed augue ridiculus magnis vitae placerat tempus curabitur. + +Aliquam habitant eu curae est eget orci auctor, non vehicula augue montes litora ac, class quis cum volutpat condimentum ullamcorper. Quisque consequat est vehicula volutpat at proin gravida sociosqu, nec dis ac ultricies phasellus viverra donec nullam, eros potenti facilisis mauris ad curabitur quis. Magna nisl ligula tellus conubia accumsan fringilla iaculis inceptos leo litora, eget integer malesuada rhoncus varius a tempor augue. Posuere nullam parturient eleifend quisque ornare vulputate curae ultricies iaculis est, odio scelerisque hendrerit non primis ut leo ante libero, nisi eu quam euismod habitant velit per lectus cubilia. + +Blandit quisque urna proin nostra praesent dui, magnis sollicitudin auctor ultrices platea sociis habitant, ut faucibus habitasse luctus elementum. Hendrerit elementum rutrum in erat nulla facilisi mauris torquent mus, diam consequat pulvinar tempor sociosqu conubia ornare ante, vehicula litora scelerisque magna placerat eleifend sapien risus. Pellentesque curabitur parturient per facilisi rhoncus porta posuere enim hendrerit, lacus litora aptent etiam vel id ante rutrum donec, platea gravida integer urna tristique est potenti class. Mus ante ut cursus in lacinia, sollicitudin posuere inceptos ullamcorper a, nam cras mi venenatis. + +Arcu magna lacus habitant eleifend cursus vitae, fermentum diam scelerisque nisi habitasse, conubia felis quis suscipit facilisis. Sociosqu erat lectus etiam aliquam quis vulputate praesent pharetra cras nam fermentum ultricies, nunc parturient fames imperdiet sem posuere molestie mi felis suscipit. 
Tortor etiam ligula leo nunc senectus sem pharetra, viverra suscipit egestas cum eu ullamcorper netus accumsan, eleifend porttitor sed lectus varius integer. Sem nascetur ligula ultrices risus eros nisl quisque, pulvinar lacinia sagittis magna primis odio dictum, metus a curabitur ante taciti inceptos. + +Mollis laoreet sollicitudin augue tortor facilisis cubilia molestie auctor erat sociis, condimentum parturient vestibulum lacinia urna potenti nascetur vehicula varius tempor mattis, velit maecenas tristique a habitant et porttitor tempus netus. Habitant interdum penatibus litora himenaeos dignissim torquent quam nulla, praesent elementum ad potenti accumsan class urna malesuada ut, aliquet aliquam egestas venenatis leo eu rhoncus pellentesque, augue ultricies posuere fames nullam aenean pretium. Tristique penatibus neque leo dignissim vulputate bibendum rhoncus pharetra, sem rutrum vehicula mauris lobortis proin platea, viverra metus natoque accumsan hendrerit posuere nunc. Aliquam nam porttitor leo tortor vel tempor nulla non, sollicitudin habitasse ornare magnis feugiat metus viverra quisque libero, risus eget enim orci torquent aptent molestie. + +Dignissim laoreet quis ligula non auctor id pellentesque justo, varius platea eget convallis dictum dui faucibus nec porttitor, porta praesent eu ante in rhoncus congue. Massa etiam eget vel torquent dis potenti accumsan ultrices, pulvinar et cursus cubilia maecenas diam himenaeos nunc blandit, semper vulputate turpis at scelerisque porttitor primis. Nam odio venenatis maecenas at tortor viverra metus, turpis suscipit ad facilisis elementum primis felis luctus, tempor curabitur suspendisse lobortis nunc ligula. + +Ante aliquet ultricies est lobortis a sollicitudin urna parturient eu, nec massa cursus mollis sagittis id risus accumsan condimentum, nisl platea habitant aenean eros leo fringilla blandit. 
Mi semper convallis posuere dictum integer torquent suspendisse, in rhoncus nulla himenaeos sociosqu cras praesent quam, nostra turpis scelerisque tempor facilisi velit. Mauris nec ut risus imperdiet varius venenatis quam ligula, luctus cursus velit scelerisque ullamcorper ultrices sociis viverra, vulputate lectus volutpat sodales nostra tincidunt suspendisse. Senectus fermentum bibendum a tristique sed sociosqu potenti, lectus ante egestas ac consequat donec eros, penatibus enim ridiculus luctus cursus malesuada. + +Dui dictum dignissim dis ultricies justo donec nisi, cum quisque rhoncus aliquam interdum iaculis dapibus, fringilla phasellus accumsan eget odio inceptos. Placerat laoreet iaculis nullam enim praesent diam semper porta montes, nisi commodo tempus rutrum nostra in himenaeos cum primis mollis, auctor congue venenatis a sed sollicitudin pulvinar ad. Duis faucibus penatibus mauris turpis tempus suscipit, litora habitasse ultricies potenti auctor, semper in ac placerat sollicitudin. + +Turpis at taciti lacus aenean cum, donec facilisi diam neque, pellentesque mattis sem auctor. Duis donec maecenas consequat nullam a fusce cubilia malesuada, hendrerit ad porttitor ac neque netus dictum, felis suscipit est nisl parturient porta elementum. Ullamcorper tempor porttitor quis integer nullam proin taciti facilisis eget dui habitasse, nisl ad erat placerat curae dictum litora lectus urna facilisi, varius tincidunt nam enim lacus tellus est suspendisse porta cum. + +Conubia a rhoncus metus felis nullam dictumst tempus dignissim, egestas neque pulvinar tincidunt feugiat congue suscipit elementum, hac sociosqu fringilla nunc bibendum magna curae. Netus massa suspendisse tellus sapien a montes, metus varius aenean mauris tempus dis fames, tincidunt eu vulputate quis pulvinar. Lobortis curabitur molestie tortor aliquam posuere magnis consequat, tellus suspendisse purus pretium ultricies nibh fermentum, potenti odio egestas tempus varius id. 
+ +Placerat fames proin suspendisse porta posuere quam orci senectus integer sed, nostra diam elementum phasellus vulputate dictum litora accumsan platea, sociosqu morbi dictumst nascetur parturient lacinia cubilia blandit pretium. Felis nostra natoque facilisis taciti diam nam netus est malesuada, tellus accumsan montes arcu lacinia et dictum rhoncus commodo, cum purus dui maecenas egestas sollicitudin eu risus. Augue ullamcorper penatibus at curae urna hac habitant suspendisse fringilla platea, fames sed fermentum sociis etiam sapien ac dictum maecenas cras, volutpat nullam tempus ornare leo ultricies lobortis mus arcu. Velit consequat fermentum facilisis eleifend vestibulum ullamcorper platea mi faucibus potenti sagittis nisl, himenaeos volutpat pellentesque nascetur gravida tempus interdum enim tristique sed curabitur mollis, commodo magnis facilisi tempor ultrices vehicula vel nisi metus iaculis varius. + +Rhoncus aliquet fermentum imperdiet senectus porttitor vulputate pharetra tortor, feugiat suscipit proin magnis cubilia primis magna urna, blandit facilisis cum aenean purus curabitur platea. Pharetra dis vivamus cursus proin hendrerit faucibus himenaeos praesent, mus facilisi sodales sed curabitur scelerisque aliquam, aenean velit platea mollis ultrices integer tincidunt. Montes vivamus phasellus tempus tellus a fermentum habitant hendrerit parturient ligula mollis et, varius dapibus sed cras nam libero blandit eu vestibulum laoreet nunc, porttitor ut pretium curae dictum id justo erat nisl nisi integer. + +Ultrices iaculis per netus odio condimentum molestie penatibus nibh, ultricies faucibus cras sagittis neque ante pulvinar, justo ad ullamcorper at malesuada tellus nisl. Porttitor lacinia vestibulum ut condimentum donec, blandit ullamcorper euismod fringilla pharetra id, natoque lectus pretium vel. 
Sodales elementum sed est himenaeos ligula luctus porta montes cum, integer eu vivamus volutpat viverra pulvinar orci faucibus nostra, maecenas neque magnis dis nulla habitant metus velit. Urna quam a enim scelerisque pretium taciti vestibulum quisque dignissim, suspendisse nisl habitasse turpis accumsan nec pellentesque inceptos, tempor aptent ad sollicitudin velit praesent porttitor facilisis. + +Lacus dui velit mus ut cursus ridiculus montes, id vehicula vivamus taciti egestas urna vulputate, rutrum dapibus aptent non ullamcorper aliquet. Eget erat dictum montes facilisis sodales nascetur ante quisque, mattis venenatis penatibus senectus ultricies praesent himenaeos, aliquam porttitor accumsan diam quis platea et. Habitasse donec parturient lectus vehicula non magnis quis et ante netus, natoque proin lacus posuere commodo nisl eget placerat sed aenean, imperdiet lobortis volutpat massa cubilia curae metus nisi blandit. Viverra tortor suspendisse aenean nisl pretium augue, parturient vestibulum dignissim tristique quis, neque ultricies ad quisque lacinia. + +Condimentum nisl mus pulvinar semper metus placerat habitasse commodo aptent, fermentum eros mollis inceptos venenatis ut natoque id hac magna, per ornare penatibus conubia tellus sed erat mi. Etiam felis enim inceptos libero facilisis dis litora imperdiet cursus netus, sapien accumsan in turpis facilisi fermentum mus dictumst fames, bibendum aptent metus habitasse tempus condimentum ante augue volutpat. Pulvinar inceptos sociis elementum blandit facilisi natoque eu, mollis neque lacus aliquet tristique massa habitasse, mus praesent vestibulum augue porta nisl. Quisque porta vestibulum sociis ad vulputate felis conubia lacus enim, sociosqu libero luctus condimentum nibh parturient et lobortis, egestas mauris proin tempus montes pulvinar senectus dictum. 
+ +Pharetra habitasse praesent tristique taciti dignissim nullam faucibus mus at, curabitur inceptos libero accumsan facilisis tempus duis mi ut, massa magnis vitae metus est magna placerat nam, convallis aliquet sed auctor ullamcorper gravida rhoncus aptent. Platea aenean sagittis per fringilla mollis auctor rhoncus, blandit magna aptent egestas himenaeos tincidunt malesuada eget, luctus ad massa vulputate sapien pulvinar. Senectus scelerisque gravida viverra morbi metus augue suspendisse, pulvinar maecenas urna dictum nascetur cursus, sem ultricies curae enim parturient accumsan. + +Nunc cubilia fusce ullamcorper senectus vulputate pellentesque natoque ac, taciti tortor nisl torquent quis posuere mus. Vel dignissim nulla imperdiet accumsan aliquet faucibus hendrerit ultricies neque vivamus, tempus feugiat praesent sodales rhoncus taciti congue ad dis velit, orci himenaeos quis hac suscipit litora ornare senectus dui. Inceptos nec condimentum viverra et augue lectus nunc diam, eros dis purus magna nullam ligula ultrices tortor, velit aenean tellus id porttitor faucibus volutpat. Quisque blandit gravida integer sociosqu est accumsan pulvinar, nullam condimentum conubia vulputate cursus netus iaculis, urna a habitant scelerisque aptent torquent. Vulputate himenaeos class malesuada tortor interdum velit potenti quisque risus pharetra, primis cum lectus mi ullamcorper sociosqu consequat posuere nisi, varius eleifend arcu id eget vel nullam etiam blandit. + +Quisque vestibulum proin torquent vel dictum convallis ligula placerat suspendisse enim, tristique lobortis sem feugiat libero lacus parturient tempus volutpat, habitasse imperdiet sociosqu mi dapibus scelerisque sollicitudin ullamcorper et. Euismod scelerisque mauris augue lacus porttitor cras ornare penatibus, nascetur egestas placerat platea cubilia varius volutpat duis malesuada, quisque mus ridiculus habitant senectus suscipit morbi. 
Ultrices leo cras morbi magna curabitur potenti vel mi, non hac varius imperdiet id metus ornare, nullam in quis dapibus torquent eros rhoncus. Nullam dapibus quisque luctus sollicitudin lacus euismod porta pulvinar sapien rutrum est, feugiat mollis nec ridiculus aenean sem tristique massa suspendisse faucibus. + +Bibendum tempor congue sed curabitur non quam velit porta, mauris montes mattis mollis sodales vivamus sociosqu tempus, himenaeos penatibus taciti commodo in id maecenas. Vulputate pretium mauris at viverra mus massa vehicula parturient, conubia velit tempus eleifend libero bibendum curabitur in ultricies, hendrerit tincidunt consequat porttitor justo commodo id. Neque congue sociosqu morbi massa libero aliquet purus nibh conubia, venenatis diam mauris justo mollis felis fusce tempus quis, suspendisse gravida blandit viverra bibendum euismod porttitor placerat." + ] { + let reference = Codebook::from_sequence(sample.bytes(), BitLen::new(std::u8::MAX)).unwrap(); + + struct ByteAlphabet; + impl Alphabet for ByteAlphabet { + type Symbol = u8; + fn read_literal(mut input: R) -> Result + where R: io::Read { + let mut buf = [0]; + input.read_exact(&mut buf)?; + Ok(buf[0]) + } + fn write_literal(symbol: &Self::Symbol, mut output: W) -> Result<(), io::Error> + where + W: io::Write + { + output.write_all(&[*symbol])?; + Ok(()) + } + } + + { + // Test as a static alphabet. + impl StaticAlphabet for ByteAlphabet { + fn len() -> u32 { + std::u8::MAX as u32 + } + fn symbol(index: u32) -> Option { + index.try_into().ok() + } + } + + // ...write + let mut buf = vec![]; + reference.write_static::(&mut buf) + .unwrap(); + + // ...read + let result = Codebook::read_static::(io::Cursor::new(&buf)) + .unwrap(); + + assert_eq!(result, reference); + } + + { + // Test as a dynamic alphabet. 
+ impl DynamicAlphabet for ByteAlphabet { } + + // ...write + let mut buf = vec![]; + reference.write_dynamic::(&mut buf) + .unwrap(); + + + // ...read + let result = Codebook::read_dynamic::(io::Cursor::new(&buf)) + .unwrap(); + + assert_eq!(result, reference); + } + } +} diff --git a/crates/binjs_io/src/context/huffman/mod.rs b/crates/binjs_io/src/context/huffman/mod.rs new file mode 100644 index 000000000..282148b51 --- /dev/null +++ b/crates/binjs_io/src/context/huffman/mod.rs @@ -0,0 +1,379 @@ +use io::statistics::Instances; + +use std::borrow::Cow; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashMap}; +use std::hash::Hash; +use std::io; + +/// Huffman trees. +mod codebook; + +/// Reading from bitstreams and decoding their contents using Huffman tables. +pub mod read; + +/// A newtype for `u8` used to count the length of a key in bits. +#[derive( + Constructor, + Debug, + Default, + Display, + Serialize, + Deserialize, + From, + Into, + Add, + AddAssign, + Sub, + SubAssign, + Clone, + Copy, + PartialOrd, + Ord, + PartialEq, + Eq, +)] +pub struct BitLen(u8); +impl BitLen { + pub fn as_u8(&self) -> u8 { + self.0 + } +} + +/// The maximal number of bits permitted in a Huffman key +/// in this format. 
+pub const MAX_CODE_BIT_LEN: BitLen = BitLen(20); + +/// Convenience implementation of operator `<<` in +/// `bits << bit_len` +impl std::ops::Shl for u32 { + type Output = u32; + fn shl(self, rhs: BitLen) -> u32 { + self << Into::::into(rhs) + } +} +impl std::ops::Shl for usize { + type Output = usize; + fn shl(self, rhs: BitLen) -> usize { + self << Into::::into(rhs) + } +} + +/// Convenience implementation of operator `>>` in +/// `bits >> bit_len` +impl std::ops::Shr for u32 { + type Output = u32; + fn shr(self, rhs: BitLen) -> u32 { + if rhs.as_u8() == 32 { + return 0; + } + self >> rhs.as_u8() + } +} +impl std::ops::Shr for usize { + type Output = usize; + fn shr(self, rhs: BitLen) -> usize { + if rhs.as_u8() == 32 { + return 0; + } + self >> rhs.as_u8() + } +} + +/// The largest acceptable length for a key. +/// +/// Hardcoded in the format. +const MAX_CODE_BIT_LENGTH: u8 = 20; + +/// A sequence of bits, read from a bit stream. +/// +/// Typically used for lookup of entries in Huffman tables. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct BitSequence { + bits: u32, + bit_len: BitLen, +} +impl BitSequence { + pub fn new(bits: u32, bit_len: BitLen) -> Self { + assert!(bit_len.as_u8() <= 32); + assert_eq!(bits >> bit_len, 0); + Self { bits, bit_len } + } + + pub fn bits(&self) -> u32 { + self.bits + } + + /// The number of bits of `bits` to use. + pub fn bit_len(&self) -> BitLen { + self.bit_len + } + + /// Split the bits into a prefix of `bit_len` bits and a suffix containing the + /// remaining bits. + /// + /// If `bit_len` is larger than the number of bits, the prefix is padded with + /// lower-weight bits into `bit_len` bits. 
+ pub fn split_raw_bits(&self, bit_len: BitLen) -> (u32, u32) { + debug_assert!(bit_len.as_u8() <= 32); + if self.bit_len <= bit_len { + let padding = bit_len - self.bit_len; + (self.bits << padding, 0) + } else { + let shift: BitLen = self.bit_len - bit_len; + let co_shift: BitLen = BitLen::new(32) - shift; + (self.bits >> shift, self.bits & (std::u32::MAX >> co_shift)) + } + } + + /// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len` + /// bits. + /// + /// # Failure + /// + /// This function panics if `bit_len > self.bit_len`. + pub fn split(&self, bit_len: BitLen) -> (BitSequence, BitSequence) { + let (prefix, suffix) = self.split_raw_bits(bit_len); + ( + BitSequence::new(prefix, bit_len), + BitSequence::new( + suffix, + if self.bit_len >= bit_len { + self.bit_len - bit_len + } else { + BitLen::new(0) + }, + ), + ) + } + + /// Add lowest-weight to this bit sequence bits until it reaches + /// a sufficient bit length. + /// + /// Does nothing if the bit sequence already has a sufficient bitlength. + pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow { + assert!(total_bit_len.as_u8() <= 32); + if total_bit_len <= self.bit_len { + return Cow::Borrowed(self); + } + let shift: BitLen = total_bit_len - self.bit_len; + Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len)) + } + + /// Return a range representing all possible suffixes of this `BitSequence` + /// containing exactly `bit_len` bits. + /// + /// If this `BitSequence` is already at least `bit_len` bits long, we + /// truncate the `BitSequence` to `bit_len` bits by removing the + /// lower-weight bits and there is only one such suffix. 
+ /// + /// ``` + /// use binjs_io::context::huffman::{ BitLen, BitSequence }; + /// + /// let zero = BitSequence::new(0, BitLen::new(0)); + /// + /// let range = zero.suffixes(BitLen::new(0)); + /// assert_eq!(range, 0..1); + /// + /// let range = zero.suffixes(BitLen::new(2)); + /// assert_eq!(range, 0..4); + /// + /// let range = zero.suffixes(BitLen::new(3)); + /// assert_eq!(range, 0..8); + /// + /// let range = zero.suffixes(BitLen::new(4)); + /// assert_eq!(range, 0..16); + /// + /// let sequence = BitSequence::new(0b00000100, BitLen::new(3)); + /// + /// let range = sequence.suffixes(BitLen::new(0)); + /// assert_eq!(range, 0..1); + /// + /// let range = sequence.suffixes(BitLen::new(2)); + /// assert_eq!(range, 2..3); + /// + /// let range = sequence.suffixes(BitLen::new(3)); + /// assert_eq!(range, 4..5); + /// + /// let range = sequence.suffixes(BitLen::new(4)); + /// assert_eq!(range, 8..10); // 0b000001000 to 0b00001001 included + /// ``` + pub fn suffixes(&self, bit_len: BitLen) -> std::ops::Range { + debug_assert!(bit_len.as_u8() as usize <= 8 * std::mem::size_of_val(&self.bits())); + debug_assert!( + std::mem::size_of_val(&self.bits()) == std::mem::size_of::(), + "The arithmetics relies upon the fact that we're only using `u32` for Huffman keys" + ); + let (first, last) = if bit_len <= self.bit_len() { + // We have too many bits, we need to truncate the bits, + // then return a single element. + let shearing: BitLen = self.bit_len() - bit_len; + let first = self.bits() >> shearing; + (first, first) + } else { + // We need to pad with lower-weight 0s. 
+ let padding: BitLen = bit_len - self.bit_len(); + let co_padding = BitLen::new(32) - padding; + let first = self.bits() << padding; + let len = std::u32::MAX >> co_padding; + (first, first + len) + }; + first..(last + 1) + } +} + +#[test] +fn test_bit_sequence_split() { + let bits = 0b11111111_11111111_00000000_00000000; + let key = BitSequence::new(bits, BitLen(32)); + assert_eq!(key.split_raw_bits(BitLen(0)), (0, bits)); + assert_eq!(key.split_raw_bits(BitLen(32)), (bits, 0)); + assert_eq!(key.split_raw_bits(BitLen(16)), (0b11111111_11111111, 0)); + + let bits = 0b00000000_00000000_00000000_11111111; + let key = BitSequence::new(bits, BitLen(16)); + assert_eq!(key.split_raw_bits(BitLen(0)), (0, bits)); + assert_eq!(key.split_raw_bits(BitLen(16)), (bits, 0)); + assert_eq!(key.split_raw_bits(BitLen(8)), (0, 0b11111111)); +} + +/// A Huffman key +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Key(BitSequence); + +impl Key { + /// Create a new Key. + /// + /// Note that we only use the `bit_len` lowest-weight bits. + /// + /// # Failure + /// + /// - Panic if any bit other than the `bit_len` lowest-weight bits is not 0. + /// - Panic if the bit length is greater than 20. + pub fn new(bits: u32, bit_len: BitLen) -> Self { + assert!(bit_len <= BitLen::new(20)); + Self::try_new(bits, bit_len).expect("Invalid Key") + } + + /// Create a new Key. + /// + /// Note that we only use the `bit_len` lowest-weight bits. + /// Any other bit MUST BE 0. + pub fn try_new(bits: u32, bit_len: BitLen) -> Result { + // May the value fit in a `Key`? + if bit_len.as_u8() > 32 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "bitlength exceeds Key capacity", + )); + } + // Are the heavy-weight bits 0s, as expected? 
+ if bit_len.as_u8() < 32 { + if bits >> bit_len != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid Key content", + )); + } + } + Ok(Key(BitSequence { bits, bit_len })) + } + + pub fn from_bit_sequence(sequence: BitSequence) -> Self { + Self::new(sequence.bits, sequence.bit_len) + } + + /// The bits in this Key. + /// + /// # Invariant + /// + /// Only the `self.bit_len()` lowest-weight bits may be non-0. + pub fn bits(&self) -> u32 { + self.0.bits + } + + /// The number of bits of `bits` to use. + pub fn bit_len(&self) -> BitLen { + self.0.bit_len + } + + pub fn as_bit_sequence(&self) -> &BitSequence { + &self.0 + } +} + +/// A node in the Huffman tree. +struct Node { + /// The total number of instances of all `NodeContent::Leaf(T)` in this subtree. + instances: Instances, + + /// The content of the node. + content: NodeContent, +} + +/// Contents of a node in the Huffman tree. +enum NodeContent { + /// A value from the stream of values. + Leaf(T), + + /// An internal node obtained by joining two subtrees. + Internal { + left: Box>, + right: Box>, + }, +} + +/// Custom ordering of `NodeContent`. +/// +/// We compare *only* by number of instances. +impl PartialOrd for Node { + fn partial_cmp(&self, other: &Self) -> Option { + self.instances.partial_cmp(&other.instances) + } +} +impl Ord for Node { + fn cmp(&self, other: &Self) -> Ordering { + self.instances.cmp(&other.instances) + } +} +impl PartialEq for Node { + fn eq(&self, other: &Self) -> bool { + self.instances.eq(&other.instances) + } +} +impl Eq for Node {} + +/// An alphabet of symbols. +pub trait Alphabet { + type Symbol: Ord + Clone; + + /// Read a symbol from an input stream. + fn read_literal(input: R) -> Result + where + R: io::Read; + + fn write_literal(symbol: &Self::Symbol, output: W) -> Result<(), io::Error> + where + W: io::Write; +} + +/// An alphabet of symbols known statically from the grammar. +/// Also known as `Implicit Symbols` in the grammar. 
+/// +/// For instance, in most languages, there is a finite set of +/// arithmetic operators specified by the grammar. +pub trait StaticAlphabet: Alphabet { + /// The number of symbols in this static alphabet. + fn len() -> u32; + + /// Return the nth value of the alphabet or `None` if there is no such value. + fn symbol(u32) -> Option; +} + +/// An alphabet of symbols known dynamically from the file. +/// Also known as `Explicit Symbols` in the grammar. +/// +/// For instance, in most languages, the set of literal strings +/// actually used in a file is determined by the user, not by +/// the grammar. +pub trait DynamicAlphabet: Alphabet {} diff --git a/crates/binjs_io/src/context/huffman/read.rs b/crates/binjs_io/src/context/huffman/read.rs new file mode 100644 index 000000000..2320a10fe --- /dev/null +++ b/crates/binjs_io/src/context/huffman/read.rs @@ -0,0 +1,600 @@ +//! Huffman tables for reading. +//! +//! These tables are designed to aid decoding from sequences of bits +//! into values. + +use context::huffman::codebook::*; +use context::huffman::*; + +use std::convert::{TryFrom, TryInto}; + +/// A Huffman table. +/// +/// We have several implementations of HuffmanTable designed for +/// distinct space/speed tradeoffs. +pub trait HuffmanTable +where + T: Clone, +{ + /// Return the number of elements in the table. + fn len(&self) -> usize; + + /// Return bit length of the table with most elements. + fn highest_bit_len(&self) -> BitLen; + + /// Lookup a value from a sequence of bits. + /// + /// The sequence of bits MUST be at least as long as `highest_bit_len`. + /// Use the `Key` result to determine how many bits need to actually be + /// consumed from the bit stream. + fn lookup(&self, key: &BitSequence) -> Option>; +} + +/// A type that has a maximal value. 
+pub trait ValueIndex: TryFrom + TryInto + Clone { + fn max_value() -> Self; +} +impl ValueIndex for u8 { + fn max_value() -> u8 { + std::u8::MAX + } +} +impl ValueIndex for u32 { + fn max_value() -> u32 { + std::u32::MAX + } +} +impl ValueIndex for usize { + fn max_value() -> usize { + std::usize::MAX + } +} + +/// An implementation of Huffman Tables as a vector designed to allow +/// constant-time lookups at the expense of high space complexity. +/// +/// Type parameter `V` is the internal type of indices. Instantiating +/// with `V = u8` will provide the maximal speed and space-efficiency +/// but will only work if the table contains at most 2^8 values. +/// Alternatively, you may instantiate with `u32` or `usize` for +/// larger tables. +/// +/// # Time complexity +/// +/// Lookups take constant time, which essentially consists in two +/// simple vector lookups. +/// +/// # Space complexity +/// +/// After initialization, a `SingleLookupHuffmanTable` +/// requires O(2 ^ max bit length in the table) space: +/// +/// - A vector `values` containing one entry per symbol. +/// - A vector `saturated` containing exactly 2 ^ (max bit length in the +/// table) entries, which we use to map any combination of `maxBitLength` +/// bits onto the only `HuffmanEntry` that may be reached by a prefix +/// of these `maxBitLength` bits. See below for more details. +/// +/// # Algorithm +/// +/// Consider the following Huffman table +/// +/// Symbol | Binary Code | Int value of Code | Bit Length +/// ------ | ------------ | ----------------- | ---------- +/// A | 11000 | 24 | 5 +/// B | 11001 | 25 | 5 +/// C | 1101 | 13 | 4 +/// D | 100 | 4 | 3 +/// E | 101 | 5 | 3 +/// F | 111 | 7 | 3 +/// G | 00 | 0 | 2 +/// H | 01 | 1 | 2 +/// +/// By definition of a Huffman Table, the Binary Codes represent +/// paths in a Huffman Tree. Consequently, padding these codes +/// to the end would not change the result. 
+/// +/// Symbol | Binary Code | Int value of Code | Bit Length +/// ------ | ------------ | ----------------- | ---------- +/// A | 11000 | 24 | 5 +/// B | 11001 | 25 | 5 +/// C | 1101? | [26...27] | 4 +/// D | 100?? | [16...19] | 3 +/// E | 101?? | [20..23] | 3 +/// F | 111?? | [28..31] | 3 +/// G | 00??? | [0...7] | 2 +/// H | 01??? | [8...15] | 2 +/// +/// Row "Int value of Code" now contains all possible values +/// that may be expressed in 5 bits. By using these values +/// as array indices, we may therefore represent the +/// Huffman table as an array: +/// +/// Index | Symbol | Bit Length +/// --------- | ---------- | ------------- +/// [0...7] | G | 2 +/// [8...15] | H | 2 +/// [16...19] | D | 3 +/// [20...23] | E | 3 +/// 24 | A | 5 +/// 25 | B | 5 +/// [26...27] | C | 4 +/// [28...31] | F | 3 +/// +/// By using the next 5 bits in the bit buffer, we may, in +/// a single lookup, determine the symbol and the bit length. +/// +/// In the current implementation, to save some space, we have +/// two distinct arrays, one (`values`) with a single instance of each +/// symbols bit length, and one (`saturated`) with indices into that +/// array. +#[derive(Debug)] +pub struct SingleLookupHuffmanTable { + highest_bit_len: BitLen, + saturated: Vec, + values: Vec<(T, Key)>, +} +impl SingleLookupHuffmanTable +where + V: ValueIndex, +{ + /// Construct a Huffman table from a Codebook. + /// + /// Time complexity: `O(2^codebook.max_bit_len())`. + pub fn from_codebook(codebook: Codebook) -> Self { + assert!( + codebook.len() + <= V::max_value() + .try_into() + .unwrap_or_else(|_| panic!("Too many keys for ValueIndex")) + ); + let highest_bit_len = codebook.highest_bit_len(); + + let mut values = Vec::with_capacity(codebook.len()); + + // Fill `saturated` with a default value of `V::max_value()`. + // This is the value most likely to trigger errors in case + // we have a bug in the implementation of `SingleLookupHuffmanTable` + // or if the data provided is inconsistent. 
+ let mut saturated = Vec::with_capacity(1usize << highest_bit_len); + saturated.resize(1usize << highest_bit_len, V::max_value()); + + for (value_index, (value, key)) in codebook.into_iter().enumerate() { + let value_index: V = value_index + .try_into() + .unwrap_or_else(|_| panic!("Too many keys for ValueIndex")); + + // When we perform lookup, we will extract `highest_bit_len` bits from the key + // into a value `0bB...B`. We have a match for `value` if and only if + // `0bB...B` may be decomposed into `0bC...CX...X` such that + // - `0bC...C` is `bit_len` bits long; + // - `0bC...C == bits`. + // + // To perform a fast lookup, we precompute all possible values of `0bB...B` + // for which this condition is true. That's all the values of segment + // `[0bC...C0...0, 0bC...C1...1]`. + let padding = highest_bit_len - key.bit_len(); + assert!(padding.as_u8() < 32); + + // `seg_begin` holds `0bC...C0...0` above + let seg_begin = (key.bits() << padding) as usize; + + // `seg_len` holds `0bC...C1...1` - `0bC...C0...0` + let seg_len: usize = if padding.as_u8() == 0 { + 0 + } else { + let shift: u8 = + u8::checked_sub(8 * std::mem::size_of::() as u8, padding.into()) + .unwrap(); + std::usize::MAX >> shift + } + 1; + for entry in &mut saturated[seg_begin..seg_begin + seg_len] { + *entry = value_index.clone(); + } + + values.push((value, key)); + } + + Self { + highest_bit_len, + saturated, + values, + } + } +} + +impl HuffmanTable for SingleLookupHuffmanTable +where + V: ValueIndex, + T: Clone, +{ + /// Constant time length access. + fn len(&self) -> usize { + self.values.len() + } + + /// Constant time highest bit access. + fn highest_bit_len(&self) -> BitLen { + self.highest_bit_len + } + + /// Constant-time lookup. 
+ fn lookup(&self, key: &BitSequence) -> Option> { + assert!(key.bit_len() >= self.highest_bit_len()); + let (prefix, _) = key.split_raw_bits(self.highest_bit_len()); + let value_index = self.saturated[prefix as usize].clone(); + let value_index: usize = value_index + .try_into() + .unwrap_or_else(|_| panic!("Value index does not fit into a usize")); + let entry = self.values.get(value_index)?; + Some(Cow::Borrowed(entry)) + } +} + +/// An alias for `SingleLookupHuffmanTable::from_codebook`, meant mainly to be used in +/// `MultiLookupHuffmanTable::from_codebook`. +impl From> for SingleLookupHuffmanTable +where + V: ValueIndex, +{ + fn from(codebook: Codebook) -> Self { + Self::from_codebook(codebook) + } +} + +/// A table designed to support fast lookup in large sets of data. +/// In most cases, lookup will be slower than a `SingleLookupHuffmanTable` +/// but, particularly in heavily unbalanced trees, the table will +/// take ~2^prefix_len fewer internal entries than a `SingleLookupHuffmanTable`. +/// +/// Typically, use this table whenever codes range between 10 and 20 bits. +/// +/// # Time complexity +/// +/// Assuming that lookups in `Subtable` take constant time, a lookup in `MultiLookupHuffmanTable` +/// will also take constant time: +/// +/// - a constant-time lookup to determine into which Subtable to perform the lookup; +/// - a constant-time lookup into Subtable; +/// - a final constant-time lookup to extract the result. // FIXME: We could get rid of this final lookup. +/// +/// +/// # Space complexity +/// +/// TBD. Highly dependent on the shape of the Huffman Tree. 
+/// +/// +/// # Algorithm +/// +/// Consider the following Huffman table +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ------------ | ---------- +/// A | 11000 | 5 +/// B | 11001 | 5 +/// C | 1101 | 4 +/// D | 100 | 3 +/// E | 101 | 3 +/// F | 111 | 3 +/// G | 00 | 2 +/// H | 01 | 2 +/// +/// With a prefix length of 3, we will precompute all possible 3-bit prefixes +/// and split the table across such prefixes. +/// +/// Prefix | Int Value of Prefix | Symbols | Max bit length +/// ------ | ------------------- | --------- | -------------- +/// 000 | 0 | G | 0 +/// 001 | 1 | G | 0 +/// 010 | 2 | H | 0 +/// 011 | 3 | H | 0 +/// 100 | 4 | D | 0 +/// 101 | 5 | E | 0 +/// 110 | 6 | A, B, C | 2 +/// 111 | 7 | F | 0 +/// +/// For each prefix, we build the table containing the Symbols, +/// stripping prefix from the Binary Code. +/// +/// - Prefix 000 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// G | (none) | 0 +/// +/// - Prefix 001 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// G | (none) | 0 +/// +/// - Prefix 010 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// H | (none) | 0 +/// +/// - Prefix 11 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// H | (none) | 0 +/// +/// - Prefix 100 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// D | (none) | 0 +/// +/// - Prefix 101 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// E | (none) | 0 +/// +/// - Prefix 110 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// A | 00 | 2 +/// B | 01 | 2 +/// C | 1 | 1 +/// +/// - Prefix 111 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// F | (none) | 0 +/// +/// With this transformation, we have represented one table +/// with an initial max bit length of 5 as: +/// +/// 
- 1 table with a max bit length of 2; +/// - 7 tables with a max bit length of 0. +/// +/// Consequently, instead of storing 2^5 = 32 internal references, +/// as we would have done with a SingleLookupHuffmanTable, we only +/// need to store (assuming that `SubTable` is a `SingleLookupHuffmanTable`): +/// +/// - 7 subtables with 1 reference each; +/// - 1 subtable with 2^2 = 4 references. +pub struct MultiLookupHuffmanTable { + /// The highest bit length. + highest_bit_len: BitLen, + + /// Invariant: `prefix_len < highest_bit_len`. + prefix_len: BitLen, + + /// A mapping from 0..2^prefix_len such that index `i` + /// maps to a subtable that holds all values associated + /// with a key that starts with `Key::new(i, prefix_len)`. + /// + /// Note that, to allow the use of smaller tables, keys + /// inside the subtables have been stripped + /// from the prefix `Key::new(i, prefix_len)`. + by_prefix: Vec, + + /// The number of entries in this table. + len: usize, + + values: Vec<(T, Key)>, +} + +impl MultiLookupHuffmanTable +where + SubTable: HuffmanTable + From>, + T: Clone, +{ + pub fn from_codebook(prefix_len: BitLen, codebook: Codebook) -> Self { + let len = codebook.len(); + let mut values = Vec::with_capacity(codebook.len()); + let highest_bit_len = codebook.highest_bit_len(); + + // At this stage, we cannot immediately create subtables, as + // we first need to determine the `highest_bit_len`. So we + // first need to split our Codebook into a forest of Codebooks + // sharing the same prefix. + let mut buckets = Vec::with_capacity(1usize << prefix_len); + buckets.resize_with(1usize << prefix_len, || Codebook::new()); + + // Dispatch each (value, key) to its buckets. 
+ for (value, key) in codebook.into_iter() { + let (prefix, suffix) = key.as_bit_sequence().split(prefix_len); + for index in prefix.suffixes(prefix_len) { + let ref mut bucket = buckets[index as usize]; + // Store the new mapping: + // - in the smaller Codebook, we only need the remaining bits (`suffix`); + // - in the smaller Codebook, we don't use the `value` itself but rather + // a reference to value stored in `values`. + unsafe { + bucket.add_mapping(values.len(), Key::from_bit_sequence(suffix.clone())); + } + } + values.push((value, key)); + } + + // Now convert buckets into Huffman tables + let mut by_prefix = Vec::with_capacity(1usize << prefix_len); + for bucket in buckets { + by_prefix.push(SubTable::from(bucket)); + } + + Self { + highest_bit_len, + prefix_len, + by_prefix, + len, + values, + } + } +} + +impl HuffmanTable for MultiLookupHuffmanTable +where + SubTable: HuffmanTable, + T: Clone, +{ + /// Constant-time length. + fn len(&self) -> usize { + self.len + } + + /// Constant time highest bit length. + fn highest_bit_len(&self) -> BitLen { + self.highest_bit_len + } + + /// Constant-time lookup. + fn lookup(&self, key: &BitSequence) -> Option> + where + T: Clone, + { + assert!(key.bit_len() >= self.highest_bit_len()); + + // Find in which `SingleLookupHuffmanTable` to look for the entry. + let (prefix, suffix) = key.split_raw_bits(self.prefix_len); + let ref table = self.by_prefix.get(prefix as usize)?; + + // Now lookup in second table. + let suffix = BitSequence::new(suffix, key.bit_len() - self.prefix_len); + let suffix = suffix.pad_lowest_to(table.highest_bit_len()); + let lookup = table.lookup(&suffix)?; + + // Finally, build the result. + Some(Cow::Borrowed(&self.values[lookup.0])) + } +} + +#[test] +fn test_huffman_lookup() { + // Check against a hardcoded constant, to ensure consistency + // with fbssdc implementation. 
+ + fn run_test(from_codebook: F) + where + F: Fn(Codebook) -> H, + H: HuffmanTable, + { + let sample = "appl"; + let codebook = Codebook::from_sequence(sample.chars(), BitLen::new(std::u8::MAX)).unwrap(); + let table = from_codebook(codebook); + + assert_eq!(table.len(), 3); + + // Test with all possible 2 bit sequences. + let candidate = BitSequence::new(0b10, BitLen(2)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result.as_ref(), &('a', Key::new(0b10, BitLen(2)))); + + let candidate = BitSequence::new(0b11, BitLen(2)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result.as_ref(), &('l', Key::new(0b11, BitLen(2)))); + + // With a bit length of 2, there are two keys that + // should return 'p' + for prefix in &[0b00, 0b01] { + let candidate = BitSequence::new(*prefix, BitLen(2)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result.as_ref(), &('p', Key::new(0, BitLen(1)))); + } + + // Test values with all possible 3 bit sequences. + for prefix in &[0b100, 0b101] { + let candidate = BitSequence::new(*prefix, BitLen(3)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result.as_ref(), &('a', Key::new(0b10, BitLen(2)))); + } + + for prefix in &[0b110, 0b111] { + let candidate = BitSequence::new(*prefix, BitLen(3)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result.as_ref(), &('l', Key::new(0b11, BitLen(2)))); + } + + for prefix in &[0b000, 0b001, 0b010, 0b011] { + let candidate = BitSequence::new(*prefix, BitLen(3)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result.as_ref(), &('p', Key::new(0, BitLen(1)))); + } + } + + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook) + }); + run_test::>, _>( + |codebook| 
MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook), + ); +} + +#[test] +fn test_huffman_lookup_2() { + // Check internal consistency. + + fn run_test(from_codebook: F) + where + F: Fn(Codebook) -> H, + H: HuffmanTable, + { + let sample = "Lorem ipsum dolor sit amet consectetur adipiscing elit convallis nostra, integer diam odio mus eros ut sodales sociis cursus, montes imperdiet morbi rhoncus felis venenatis curabitur magna. Volutpat tincidunt sociosqu pharetra id feugiat enim eget, integer quisque magna in senectus mollis, himenaeos malesuada convallis faucibus ornare egestas. Netus platea himenaeos suscipit nostra montes mattis, lobortis ut arcu facilisi hac ornare, integer ante sociosqu placerat morbi. + +Viverra arcu dapibus nam magna a imperdiet inceptos cubilia libero lobortis praesent habitasse, tortor id leo consequat sollicitudin elementum fames fringilla himenaeos donec. Phasellus posuere congue ultricies scelerisque senectus vivamus facilisi, vestibulum consequat aptent lectus ad sociis porta, purus libero eros leo at nec. Netus viverra urna nisl sapien conubia porta sed luctus penatibus cras, pulvinar iaculis sagittis fusce fringilla et rutrum sollicitudin ligula, dui vestibulum interdum pretium montes diam nibh inceptos ante. +"; + let codebook = Codebook::from_sequence(sample.chars(), BitLen::new(std::u8::MAX)).unwrap(); + let table = from_codebook(codebook.clone()); + for (value, key) in codebook { + // Test that candidate keys obtained by extending `key` with additional bits + // return the expected `(value, key)`. 
+ for bit_len in table.highest_bit_len().as_u8() + ..=std::cmp::min(table.highest_bit_len().as_u8() + 5, 32) + { + let candidate = key.as_bit_sequence().pad_lowest_to(BitLen(bit_len)); + let lookup = table.lookup(&candidate).expect("Lookup value not found"); + assert_eq!(lookup.0, value); + assert_eq!(lookup.1, key); + } + } + } + // Test with a single lookup. + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + + // Test with two lookups, with a very short prefix length. + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook) + }); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook), + ); + + // Test with two lookups, still with a very short prefix length. + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(2), codebook) + }); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(2), codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(2), codebook), + ); + + // Test with two lookups, with an unreasonably large prefix length. + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook) + }); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook), + ); +} diff --git a/crates/binjs_io/src/context/mod.rs b/crates/binjs_io/src/context/mod.rs index cddbfd5ba..3f12bd35c 100644 --- a/crates/binjs_io/src/context/mod.rs +++ b/crates/binjs_io/src/context/mod.rs @@ -3,7 +3,7 @@ /// Format documentation. 
pub mod format; -mod huffman; +pub mod huffman; mod strings; mod varnum; diff --git a/crates/binjs_io/src/context/varnum.rs b/crates/binjs_io/src/context/varnum.rs index b6f3426dc..a737d3a51 100644 --- a/crates/binjs_io/src/context/varnum.rs +++ b/crates/binjs_io/src/context/varnum.rs @@ -25,6 +25,17 @@ pub struct ByteValue { /// The number of bytes consumed. pub byte_len: usize, } +impl ByteValue { + /// The value read. + pub fn value(&self) -> &T { + &self.value + } + + /// The number of bytes consumed. + pub fn byte_len(&self) -> usize { + self.byte_len + } +} /// A reader that may read varu32-encoded u32 values from a stream. pub trait ReadVaru32 { diff --git a/crates/binjs_io/src/io/statistics.rs b/crates/binjs_io/src/io/statistics.rs index cc39ab5a8..e12205365 100644 --- a/crates/binjs_io/src/io/statistics.rs +++ b/crates/binjs_io/src/io/statistics.rs @@ -12,6 +12,7 @@ impl std::iter::Sum for Bytes { /// A newtype for `usize` used to count the number of instances of some item. #[derive( + Constructor, Default, Display, Serialize, diff --git a/spec/context.md b/spec/context.md index fe8311f04..28f72f573 100644 --- a/spec/context.md +++ b/spec/context.md @@ -137,7 +137,8 @@ CodeTable ::= UnitCodeTable UnitCodeTable ::= 00h LiteralSymbol MultiCodeTableExplicit ::= 01h CodeCount [CodeLength]{CodeCount} [LiteralSymbol]{CodeCount} CodeLength ::= 00h .. 14h -MultiCodeTableExplicit ::= 01h [CodeLength]{SymbolCount} +CodeCount ::= Varuint +MultiCodeTableImplicit ::= 01h [CodeLength]{SymbolCount} EmptyCodeTable ::= 02h ```