From 310edea38e0acd880eb59beb6c95eb6ea2a679df Mon Sep 17 00:00:00 2001 From: Jonathan Behrens Date: Wed, 8 Nov 2023 19:00:37 -0800 Subject: [PATCH] Reduce copying and allocations (#422) This eliminates Reader::prev and adds special handling of unfiltering for the first row. --- src/decoder/mod.rs | 61 +++++++++++++++++--------------- src/filter.rs | 87 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 29 deletions(-) diff --git a/src/decoder/mod.rs b/src/decoder/mod.rs index 29e2fd07..272e4d8c 100644 --- a/src/decoder/mod.rs +++ b/src/decoder/mod.rs @@ -210,9 +210,9 @@ impl Decoder { subframe: SubframeInfo::not_yet_init(), fctl_read: 0, next_frame: SubframeIdx::Initial, - prev: Vec::new(), - current: Vec::new(), - scan_start: 0, + data_stream: Vec::new(), + prev_start: 0, + current_start: 0, transform: self.transform, scratch_buffer: Vec::new(), limits: self.limits, @@ -347,12 +347,12 @@ pub struct Reader { /// control chunk. The IDAT image _may_ have such a chunk applying to it. fctl_read: u32, next_frame: SubframeIdx, - /// Previous raw line - prev: Vec, - /// Current raw line - current: Vec, - /// Start index of the current scan line. - scan_start: usize, + /// Vec containing the uncompressed image data currently being processed. + data_stream: Vec, + /// Index in `data_stream` where the previous row starts. + prev_start: usize, + /// Index in `data_stream` where the current row starts. 
+ current_start: usize, /// Output transformations transform: Transformations, /// This buffer is only used so that `next_row` and `next_interlaced_row` can return reference @@ -444,8 +444,7 @@ impl Reader { return Err(DecodingError::LimitsExceeded); } - self.prev.clear(); - self.prev.resize(self.subframe.rowlen, 0); + self.prev_start = self.current_start; Ok(()) } @@ -504,8 +503,9 @@ impl Reader { line_size: self.output_line_size(self.subframe.width), }; - self.current.clear(); - self.scan_start = 0; + self.data_stream.clear(); + self.current_start = 0; + self.prev_start = 0; let width = self.info().width; if self.info().interlaced { while let Some(InterlacedRow { @@ -597,7 +597,8 @@ impl Reader { output_buffer: &mut [u8], ) -> Result<(), DecodingError> { self.next_raw_interlaced_row(rowlen)?; - let row = &self.prev[1..rowlen]; + assert_eq!(self.current_start - self.prev_start, rowlen - 1); + let row = &self.data_stream[self.prev_start..self.current_start]; // Apply transformations and write resulting data to buffer. let (color_type, bit_depth, trns) = { @@ -706,8 +707,7 @@ impl Reader { let (pass, line, width) = adam7.next()?; let rowlen = self.info().raw_row_length_from_width(width); if last_pass != pass { - self.prev.clear(); - self.prev.resize(rowlen, 0u8); + self.prev_start = self.current_start; } Some((rowlen, InterlaceInfo::Adam7 { pass, line, width })) } @@ -723,7 +723,7 @@ impl Reader { /// The scanline is filtered against the previous scanline according to the specification. fn next_raw_interlaced_row(&mut self, rowlen: usize) -> Result<(), DecodingError> { // Read image data until we have at least one full row (but possibly more than one). 
- while self.current.len() - self.scan_start < rowlen { + while self.data_stream.len() - self.current_start < rowlen { if self.subframe.consumed_and_flushed { return Err(DecodingError::Format( FormatErrorInner::NoMoreImageData.into(), @@ -731,19 +731,20 @@ impl Reader { } // Clear the current buffer before appending more data. - if self.scan_start > 0 { - self.current.drain(..self.scan_start).for_each(drop); - self.scan_start = 0; + if self.prev_start > 0 { + self.data_stream.drain(..self.prev_start).for_each(drop); + self.current_start -= self.prev_start; + self.prev_start = 0; } - match self.decoder.decode_next(&mut self.current)? { + match self.decoder.decode_next(&mut self.data_stream)? { Some(Decoded::ImageData) => {} Some(Decoded::ImageDataFlushed) => { self.subframe.consumed_and_flushed = true; } None => { return Err(DecodingError::Format( - if self.current.is_empty() { + if self.data_stream.is_empty() { FormatErrorInner::NoMoreImageData } else { FormatErrorInner::UnexpectedEndOfChunk @@ -756,17 +757,21 @@ impl Reader { } // Get a reference to the current row and point scan_start to the next one. - let row = &mut self.current[self.scan_start..]; - self.scan_start += rowlen; + let (prev, row) = self.data_stream.split_at_mut(self.current_start); // Unfilter the row. let filter = FilterType::from_u8(row[0]).ok_or(DecodingError::Format( FormatErrorInner::UnknownFilterMethod(row[0]).into(), ))?; - unfilter(filter, self.bpp, &self.prev[1..rowlen], &mut row[1..rowlen]); - - // Save the current row for the next pass. 
- self.prev[..rowlen].copy_from_slice(&row[..rowlen]); + unfilter( + filter, + self.bpp, + &prev[self.prev_start..], + &mut row[1..rowlen], + ); + + self.prev_start = self.current_start + 1; + self.current_start += rowlen; Ok(()) } diff --git a/src/filter.rs b/src/filter.rs index 22663add..39e58363 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -282,13 +282,22 @@ fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { } pub(crate) fn unfilter( - filter: FilterType, + mut filter: FilterType, tbpp: BytesPerPixel, previous: &[u8], current: &mut [u8], ) { use self::FilterType::*; + // If the previous row is empty, then treat it as if it were filled with zeros. + if previous.is_empty() { + if filter == Paeth { + filter = Sub; + } else if filter == Up { + filter = NoFilter; + } + } + // [2023/01 @okaneco] - Notes on optimizing decoding filters // // Links: @@ -452,6 +461,82 @@ pub(crate) fn unfilter( *curr = curr.wrapping_add(above); } } + Avg if previous.is_empty() => match tbpp { + BytesPerPixel::One => { + current.iter_mut().reduce(|&mut prev, curr| { + *curr = curr.wrapping_add(prev / 2); + curr + }); + } + BytesPerPixel::Two => { + let mut prev = [0; 2]; + for chunk in current.chunks_exact_mut(2) { + let new_chunk = [ + chunk[0].wrapping_add(prev[0] / 2), + chunk[1].wrapping_add(prev[1] / 2), + ]; + *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk; + prev = new_chunk; + } + } + BytesPerPixel::Three => { + let mut prev = [0; 3]; + for chunk in current.chunks_exact_mut(3) { + let new_chunk = [ + chunk[0].wrapping_add(prev[0] / 2), + chunk[1].wrapping_add(prev[1] / 2), + chunk[2].wrapping_add(prev[2] / 2), + ]; + *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; + prev = new_chunk; + } + } + BytesPerPixel::Four => { + let mut prev = [0; 4]; + for chunk in current.chunks_exact_mut(4) { + let new_chunk = [ + chunk[0].wrapping_add(prev[0] / 2), + chunk[1].wrapping_add(prev[1] / 2), + chunk[2].wrapping_add(prev[2] / 2), + chunk[3].wrapping_add(prev[3] / 
2), + ]; + *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk; + prev = new_chunk; + } + } + BytesPerPixel::Six => { + let mut prev = [0; 6]; + for chunk in current.chunks_exact_mut(6) { + let new_chunk = [ + chunk[0].wrapping_add(prev[0] / 2), + chunk[1].wrapping_add(prev[1] / 2), + chunk[2].wrapping_add(prev[2] / 2), + chunk[3].wrapping_add(prev[3] / 2), + chunk[4].wrapping_add(prev[4] / 2), + chunk[5].wrapping_add(prev[5] / 2), + ]; + *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; + prev = new_chunk; + } + } + BytesPerPixel::Eight => { + let mut prev = [0; 8]; + for chunk in current.chunks_exact_mut(8) { + let new_chunk = [ + chunk[0].wrapping_add(prev[0] / 2), + chunk[1].wrapping_add(prev[1] / 2), + chunk[2].wrapping_add(prev[2] / 2), + chunk[3].wrapping_add(prev[3] / 2), + chunk[4].wrapping_add(prev[4] / 2), + chunk[5].wrapping_add(prev[5] / 2), + chunk[6].wrapping_add(prev[6] / 2), + chunk[7].wrapping_add(prev[7] / 2), + ]; + *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk; + prev = new_chunk; + } + } + }, Avg => match tbpp { BytesPerPixel::One => { let mut lprev = [0; 1];