Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce copying and allocations #422

Merged
merged 1 commit into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 33 additions & 28 deletions src/decoder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,9 @@ impl<R: Read> Decoder<R> {
subframe: SubframeInfo::not_yet_init(),
fctl_read: 0,
next_frame: SubframeIdx::Initial,
prev: Vec::new(),
current: Vec::new(),
scan_start: 0,
data_stream: Vec::new(),
prev_start: 0,
current_start: 0,
transform: self.transform,
scratch_buffer: Vec::new(),
limits: self.limits,
Expand Down Expand Up @@ -347,12 +347,12 @@ pub struct Reader<R: Read> {
/// control chunk. The IDAT image _may_ have such a chunk applying to it.
fctl_read: u32,
next_frame: SubframeIdx,
/// Previous raw line
prev: Vec<u8>,
/// Current raw line
current: Vec<u8>,
/// Start index of the current scan line.
scan_start: usize,
/// Vec containing the uncompressed image data currently being processed.
data_stream: Vec<u8>,
/// Index in `data_stream` where the previous row starts.
prev_start: usize,
/// Index in `data_stream` where the current row starts.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it would be useful to point out that:

    1) prev_start points at pixel data only (rowlen - 1 worth of bytes), but 2) current_start points at a filter-kind byte followed by pixel data (potentially incomplete — at most rowlen worth of bytes).
    3) prev_start == current_start indicates that there is no previous row.
current_start: usize,
/// Output transformations
transform: Transformations,
/// This buffer is only used so that `next_row` and `next_interlaced_row` can return reference
Expand Down Expand Up @@ -444,8 +444,7 @@ impl<R: Read> Reader<R> {
return Err(DecodingError::LimitsExceeded);
}

self.prev.clear();
self.prev.resize(self.subframe.rowlen, 0);
self.prev_start = self.current_start;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

optional, feel free to ignore: There is a small amount of code duplication here (e.g. new line 710 looks the same). I wonder if the code would be more readable if this single line was hoisted into a well-named separate function (maybe reset_prev_row)?


Ok(())
}
Expand Down Expand Up @@ -504,8 +503,9 @@ impl<R: Read> Reader<R> {
line_size: self.output_line_size(self.subframe.width),
};

self.current.clear();
self.scan_start = 0;
self.data_stream.clear();
self.current_start = 0;
self.prev_start = 0;
let width = self.info().width;
if self.info().interlaced {
while let Some(InterlacedRow {
Expand Down Expand Up @@ -597,7 +597,8 @@ impl<R: Read> Reader<R> {
output_buffer: &mut [u8],
) -> Result<(), DecodingError> {
self.next_raw_interlaced_row(rowlen)?;
let row = &self.prev[1..rowlen];
assert_eq!(self.current_start - self.prev_start, rowlen - 1);
let row = &self.data_stream[self.prev_start..self.current_start];

// Apply transformations and write resulting data to buffer.
let (color_type, bit_depth, trns) = {
Expand Down Expand Up @@ -706,8 +707,7 @@ impl<R: Read> Reader<R> {
let (pass, line, width) = adam7.next()?;
let rowlen = self.info().raw_row_length_from_width(width);
if last_pass != pass {
self.prev.clear();
self.prev.resize(rowlen, 0u8);
self.prev_start = self.current_start;
}
Some((rowlen, InterlaceInfo::Adam7 { pass, line, width }))
}
Expand All @@ -723,27 +723,28 @@ impl<R: Read> Reader<R> {
/// The scanline is filtered against the previous scanline according to the specification.
fn next_raw_interlaced_row(&mut self, rowlen: usize) -> Result<(), DecodingError> {
// Read image data until we have at least one full row (but possibly more than one).
while self.current.len() - self.scan_start < rowlen {
while self.data_stream.len() - self.current_start < rowlen {
if self.subframe.consumed_and_flushed {
return Err(DecodingError::Format(
FormatErrorInner::NoMoreImageData.into(),
));
}

// Clear the current buffer before appending more data.
if self.scan_start > 0 {
self.current.drain(..self.scan_start).for_each(drop);
self.scan_start = 0;
if self.prev_start > 0 {
self.data_stream.drain(..self.prev_start).for_each(drop);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The drain / for_each(drop) approach seems complex - I wonder if the compiler is able to optimize this as well as a more direct move - e.g. something like:

            self.data_stream.copy_within(self.prev_start.., 0);
            self.data_stream.truncate(self.data_stream.len() - self.current_start);

I can't really read assembly well, but the drain approach seems to at least result in longer assembly: https://godbolt.org/z/z48q1xej5

OTOH, maybe this feedback doesn't belong to this PR and we should fix this in a separate PR? (e.g. drain-based compaction is also used in zlib.rs)

self.current_start -= self.prev_start;
self.prev_start = 0;
}

match self.decoder.decode_next(&mut self.current)? {
match self.decoder.decode_next(&mut self.data_stream)? {
Some(Decoded::ImageData) => {}
Some(Decoded::ImageDataFlushed) => {
self.subframe.consumed_and_flushed = true;
}
None => {
return Err(DecodingError::Format(
if self.current.is_empty() {
if self.data_stream.is_empty() {
FormatErrorInner::NoMoreImageData
} else {
FormatErrorInner::UnexpectedEndOfChunk
Expand All @@ -756,17 +757,21 @@ impl<R: Read> Reader<R> {
}

// Get a reference to the current row and point scan_start to the next one.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Probably need to remove the remaining reference to scan_start here

let row = &mut self.current[self.scan_start..];
self.scan_start += rowlen;
let (prev, row) = self.data_stream.split_at_mut(self.current_start);

// Unfilter the row.
let filter = FilterType::from_u8(row[0]).ok_or(DecodingError::Format(
FormatErrorInner::UnknownFilterMethod(row[0]).into(),
))?;
unfilter(filter, self.bpp, &self.prev[1..rowlen], &mut row[1..rowlen]);

// Save the current row for the next pass.
self.prev[..rowlen].copy_from_slice(&row[..rowlen]);
unfilter(
filter,
self.bpp,
&prev[self.prev_start..],
&mut row[1..rowlen],
);

self.prev_start = self.current_start + 1;
self.current_start += rowlen;

Ok(())
}
Expand Down
87 changes: 86 additions & 1 deletion src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,13 +282,22 @@ fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
}

pub(crate) fn unfilter(
filter: FilterType,
mut filter: FilterType,
tbpp: BytesPerPixel,
previous: &[u8],
current: &mut [u8],
) {
use self::FilterType::*;

// If the previous row is empty, then treat it as if it were filled with zeros.
if previous.is_empty() {
if filter == Paeth {
filter = Sub;
} else if filter == Up {
filter = NoFilter;
}
}

// [2023/01 @okaneco] - Notes on optimizing decoding filters
//
// Links:
Expand Down Expand Up @@ -452,6 +461,82 @@ pub(crate) fn unfilter(
*curr = curr.wrapping_add(above);
}
}
Avg if previous.is_empty() => match tbpp {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if roundtrip_ascending_previous_line and roundtrip tests (below in the same source file) could be duplicated and/or tweaked to explicitly cover round-tripping when there is no previous row?

BytesPerPixel::One => {
current.iter_mut().reduce(|&mut prev, curr| {
*curr = curr.wrapping_add(prev / 2);
curr
});
}
BytesPerPixel::Two => {
let mut prev = [0; 2];
for chunk in current.chunks_exact_mut(2) {
let new_chunk = [
chunk[0].wrapping_add(prev[0] / 2),
chunk[1].wrapping_add(prev[1] / 2),
];
*TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
prev = new_chunk;
}
}
BytesPerPixel::Three => {
let mut prev = [0; 3];
for chunk in current.chunks_exact_mut(3) {
let new_chunk = [
chunk[0].wrapping_add(prev[0] / 2),
chunk[1].wrapping_add(prev[1] / 2),
chunk[2].wrapping_add(prev[2] / 2),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
prev = new_chunk;
}
}
BytesPerPixel::Four => {
let mut prev = [0; 4];
for chunk in current.chunks_exact_mut(4) {
let new_chunk = [
chunk[0].wrapping_add(prev[0] / 2),
chunk[1].wrapping_add(prev[1] / 2),
chunk[2].wrapping_add(prev[2] / 2),
chunk[3].wrapping_add(prev[3] / 2),
];
*TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
prev = new_chunk;
}
}
BytesPerPixel::Six => {
let mut prev = [0; 6];
for chunk in current.chunks_exact_mut(6) {
let new_chunk = [
chunk[0].wrapping_add(prev[0] / 2),
chunk[1].wrapping_add(prev[1] / 2),
chunk[2].wrapping_add(prev[2] / 2),
chunk[3].wrapping_add(prev[3] / 2),
chunk[4].wrapping_add(prev[4] / 2),
chunk[5].wrapping_add(prev[5] / 2),
];
*TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
prev = new_chunk;
}
}
BytesPerPixel::Eight => {
let mut prev = [0; 8];
for chunk in current.chunks_exact_mut(8) {
let new_chunk = [
chunk[0].wrapping_add(prev[0] / 2),
chunk[1].wrapping_add(prev[1] / 2),
chunk[2].wrapping_add(prev[2] / 2),
chunk[3].wrapping_add(prev[3] / 2),
chunk[4].wrapping_add(prev[4] / 2),
chunk[5].wrapping_add(prev[5] / 2),
chunk[6].wrapping_add(prev[6] / 2),
chunk[7].wrapping_add(prev[7] / 2),
];
*TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
prev = new_chunk;
}
}
},
Avg => match tbpp {
BytesPerPixel::One => {
let mut lprev = [0; 1];
Expand Down
Loading