Skip to content

Commit

Permalink
Merge pull request #512 from Shnatsel/autovec-paeth-but-simd
Browse files Browse the repository at this point in the history
Replace handwritten SIMD implementation with autovectorization for +10% perf
  • Loading branch information
Shnatsel authored Oct 5, 2024
2 parents 272ae60 + f1b75ae commit 3fbbbb1
Showing 1 changed file with 48 additions and 29 deletions.
77 changes: 48 additions & 29 deletions src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ mod simd {
use std::simd::num::{SimdInt, SimdUint};
use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};

/// This is an equivalent of the `PaethPredictor` function from
/// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
/// except that it simultaneously calculates the predictor for all SIMD lanes.
/// Mapping between parameter names and pixel positions can be found in
/// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
/// Scalar Paeth function wrapped in SIMD scaffolding.
///
/// Examples of how different pixel types may be represented as multiple SIMD lanes:
/// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
/// - RGB => 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value
/// This is needed because simply running the function on the inputs
/// makes the compiler think our inputs are too short
/// to benefit from vectorization.
/// Putting it in SIMD scaffolding fixes that.
/// See <https://github.com/image-rs/image-png/issues/511>.
///
/// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
/// Funnily, the autovectorizer does a better job here
/// than a handwritten algorithm using std::simd!
/// We used to have a handwritten one but this is just faster.
fn paeth_predictor<const N: usize>(
a: Simd<i16, N>,
b: Simd<i16, N>,
Expand All @@ -32,28 +32,26 @@ mod simd {
where
LaneCount<N>: SupportedLaneCount,
{
let pa = b - c; // (p-a) == (a+b-c - a) == (b-c)
let pb = a - c; // (p-b) == (a+b-c - b) == (a-c)
let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

let pa = pa.abs();
let pb = pb.abs();
let pc = pc.abs();

let smallest = pc.simd_min(pa.simd_min(pb));

// Paeth algorithm breaks ties favoring a over b over c, so we execute the following
// lane-wise selection:
//
// if smallest == pa
// then select a
// else select (if smallest == pb then select b else select c)
smallest
.simd_eq(pa)
.select(a, smallest.simd_eq(pb).select(b, c))
let mut out = [0; N];
for i in 0..N {
out[i] = super::filter_paeth_decode_i16(a[i].into(), b[i].into(), c[i].into());
}
out.into()
}

/// Equivalent to `simd::paeth_predictor` but does not temporarily convert
/// This is an equivalent of the `PaethPredictor` function from
/// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
/// except that it simultaneously calculates the predictor for all SIMD lanes.
/// Mapping between parameter names and pixel positions can be found in
/// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
///
/// Examples of how different pixel types may be represented as multiple SIMD lanes:
/// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
/// - RGB => 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value
///
/// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
///
/// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert
/// the SIMD elements to `i16`.
fn paeth_predictor_u8<const N: usize>(
a: Simd<u8, N>,
Expand Down Expand Up @@ -340,6 +338,27 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
out
}

#[cfg(feature = "unstable")]
/// Paeth predictor on `i16` values.
///
/// Same selection rule as `filter_paeth_decode`, but kept in `i16` because
/// this form vectorizes better when wrapped in SIMD scaffolding.
///
/// Returns whichever of `a`, `b`, `c` lies closest to `a + b - c`.
/// Ties are resolved in favour of `a`, then `b`, then `c`.
fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
    // With p = a + b - c:
    //   |p - a| == |b - c|
    //   |p - b| == |a - c|
    //   |p - c| == |(a - c) + (b - c)|
    let dist_a = (b - c).abs();
    let dist_b = (a - c).abs();
    let dist_c = ((a - c) + (b - c)).abs();

    // Strict comparisons keep the earlier candidate on a tie.
    let (nearest, candidate) = if dist_b < dist_a {
        (dist_b, b)
    } else {
        (dist_a, a)
    };

    if dist_c < nearest {
        c
    } else {
        candidate
    }
}

fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
// This is an optimized version of the paeth filter from the PNG specification, proposed by
// Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
Expand Down

0 comments on commit 3fbbbb1

Please sign in to comment.