From 21b475c22b1410e91ecb687f413e4411234a8454 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 07:42:11 +0000 Subject: [PATCH 01/16] Initial port of stb_image PNG paeth unfiltering --- src/filter.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index 444a14d6..b17c187a 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -319,23 +319,19 @@ impl Default for AdaptiveFilterType { } fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { - // Decoding seems to optimize better with this algorithm - let pa = (i16::from(b) - i16::from(c)).abs(); - let pb = (i16::from(a) - i16::from(c)).abs(); - let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs(); - - let mut out = a; - let mut min = pa; - - if pb < min { - min = pb; - out = b; - } - if pc < min { - out = c; - } - - out + // This formulation looks very different from the reference in the PNG spec, but is + // actually equivalent and has favorable data dependencies and admits straightforward + // generation of branch-free code, which helps performance significantly. + // + // Adapted from public domain PNG implementation: + // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668 + let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b)); + let thresh = thresh as u8; + let lo = a.min(b); + let hi = a.max(b); + let t0 = if hi <= thresh { lo } else { c }; + let t1 = if thresh <= lo { hi } else { t0 }; + return t1; } #[cfg(feature = "unstable")] From 14c448d14d05f80f2bb35a967653299b097c9e7e Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 08:10:04 +0000 Subject: [PATCH 02/16] Sneaky fix, you saw nothing --- src/filter.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index b17c187a..a14ff758 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -325,13 +325,16 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { // // Adapted from public domain PNG implementation: // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668 - let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b)); - let thresh = thresh as u8; + let a = i16::from(a); + let b = i16::from(b); + let c = i16::from(c); + + let thresh = c * 3 - (a + b); let lo = a.min(b); let hi = a.max(b); let t0 = if hi <= thresh { lo } else { c }; let t1 = if thresh <= lo { hi } else { t0 }; - return t1; + return t1 as u8; } #[cfg(feature = "unstable")] From 5b19c5bc96f83fd8ddf00cced8face34e4929d86 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 08:12:33 +0000 Subject: [PATCH 03/16] Don't widen types where it's not necessary --- src/filter.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index a14ff758..00ca5b2d 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -325,16 +325,12 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { // // Adapted from public domain PNG implementation: // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668 - let a = i16::from(a); - let b = i16::from(b); - let c = i16::from(c); - - let thresh = c * 3 - (a + b); + let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b)); let lo = a.min(b); let hi = a.max(b); - let t0 = if hi <= thresh { lo } else { c }; - let t1 = if thresh <= lo { hi } else { t0 }; - return t1 as u8; + let t0 = if hi as i16 <= thresh { lo } else { c }; + let t1 = if thresh <= lo as i16 { hi } else { t0 }; + return t1; } #[cfg(feature = "unstable")] From 7bfaa4949a159b4f1fed840fd89b5fb3f2808eb1 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 08:17:14 +0000 Subject: [PATCH 04/16] Restore comment --- src/filter.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/filter.rs b/src/filter.rs index 00ca5b2d..856008ec 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -319,6 +319,8 @@ impl Default for AdaptiveFilterType { } fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { + // Decoding optimizes better with this algorithm than with `filter_paeth()` + // // This formulation looks very different from the reference in the PNG spec, but is // actually equivalent and has favorable data dependencies and admits straightforward // generation of branch-free code, which helps performance significantly. From a557386dec65099c7ae3c9a5694fb42baf9b354d Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 08:31:49 +0000 Subject: [PATCH 05/16] Add an equivalence test for the Paeth implementations --- src/filter.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/filter.rs b/src/filter.rs index 856008ec..c56ff84a 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -1157,6 +1157,18 @@ mod test { } } + #[test] + fn paeth_impls_are_equivalent() { + use super::{filter_paeth, filter_paeth_decode}; + for a in 0..=255 { + for b in 0..=255 { + for c in 0..=255 { + assert_eq!(filter_paeth(a, b, c), filter_paeth_decode(a, b, c)); + } + } + } + } + #[test] fn roundtrip_ascending_previous_line() { // A multiple of 8, 6, 4, 3, 2, 1 From f30ec3c3e29b4c2e222e07517410573942c804fc Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 20:10:01 +0000 Subject: [PATCH 06/16] Convert nightly-only portable SIMD path to the stb_image Paeth algorithm --- src/filter.rs | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index c56ff84a..8e181576 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -338,22 +338,12 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { #[cfg(feature = "unstable")] fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD - let pa = (b - c).abs(); - let pb = (a - c).abs(); - let pc = ((a - c) + (b - c)).abs(); - - let mut out = a; - let mut min = pa; - - if pb < min { - min = pb; - out = b; - } - if pc < min { - out = c; - } - - out + let thresh = c * 3 - (a + b); + let lo = a.min(b); + let hi = a.max(b); + let t0 = if hi <= thresh { lo } else { c }; + let t1 = if thresh <= lo { hi } else { t0 }; + return t1; } fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { From 21c5b4e5b7d547d086bb1805e3b63aef40f76c64 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 20:36:35 +0000 Subject: [PATCH 07/16] Drop the explicit SIMD Paeth algorithm, it's faster to use an autovectorized version of the scalar one --- src/filter.rs | 45 +++++---------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index 8e181576..ef170f8a 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -41,7 +41,7 @@ mod simd { /// This is an equivalent of the `PaethPredictor` function from /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth) - /// except that it simultaneously calculates the predictor for all SIMD lanes. + /// /// Mapping between parameter names and pixel positions can be found in /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions). /// @@ -49,8 +49,6 @@ mod simd { /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A /// - RGB => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value /// - /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280). - /// /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert /// the SIMD elements to `i16`. fn paeth_predictor_u8( @@ -61,44 +59,11 @@ mod simd { where LaneCount: SupportedLaneCount, { - // Calculates the absolute difference between `a` and `b`. - fn abs_diff_simd(a: Simd, b: Simd) -> Simd - where - LaneCount: SupportedLaneCount, - { - a.simd_max(b) - b.simd_min(a) + let mut out = [0; N]; + for i in 0..N { + out[i] = super::filter_paeth_decode(a[i].into(), b[i].into(), c[i].into()); } - - // Uses logic from `filter::filter_paeth` to calculate absolute values - // entirely in `Simd`. This method avoids unpacking and packing - // penalties resulting from conversion to and from `Simd`. - // ``` - // let pa = b.max(c) - c.min(b); - // let pb = a.max(c) - c.min(a); - // let pc = if (a < c) == (c < b) { - // pa.max(pb) - pa.min(pb) - // } else { - // 255 - // }; - // ``` - let pa = abs_diff_simd(b, c); - let pb = abs_diff_simd(a, c); - let pc = a - .simd_lt(c) - .simd_eq(c.simd_lt(b)) - .select(abs_diff_simd(pa, pb), Simd::splat(255)); - - let smallest = pc.simd_min(pa.simd_min(pb)); - - // Paeth algorithm breaks ties favoring a over b over c, so we execute the following - // lane-wise selection: - // - // if smalest == pa - // then select a - // else select (if smallest == pb then select b else select c) - smallest - .simd_eq(pa) - .select(a, smallest.simd_eq(pb).select(b, c)) + out.into() } /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`). From e0b19538147ca5ef0e36a2393730f40b9929bdb1 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 20:42:30 +0000 Subject: [PATCH 08/16] cargo fmt --- src/filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/filter.rs b/src/filter.rs index ef170f8a..f5bb959e 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -41,7 +41,7 @@ mod simd { /// This is an equivalent of the `PaethPredictor` function from /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth) - /// + /// /// Mapping between parameter names and pixel positions can be found in /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions). /// From 9ed0d9e4682383506b1c48fe7a80fde784457373 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 1 Dec 2024 21:04:23 +0000 Subject: [PATCH 09/16] ignore the paeth equivalence test by default to avoid inflating CI times --- src/filter.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/filter.rs b/src/filter.rs index f5bb959e..87a30051 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -1113,6 +1113,7 @@ mod test { } #[test] + #[ignore] // takes ~20s without optimizations fn paeth_impls_are_equivalent() { use super::{filter_paeth, filter_paeth_decode}; for a in 0..=255 { From 3e083e9278d847c6a3724b5448282c3f4c6232d7 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 2 Dec 2024 02:52:20 +0000 Subject: [PATCH 10/16] Use the previous Paeth impl on non-x86 platforms --- src/filter.rs | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/filter.rs b/src/filter.rs index 87a30051..a0a3424b 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -283,6 +283,7 @@ impl Default for AdaptiveFilterType { } } +#[cfg(target_arch = "x86_64")] fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { // Decoding optimizes better with this algorithm than with `filter_paeth()` // @@ -300,7 +301,7 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { return t1; } -#[cfg(feature = "unstable")] +#[cfg(all(feature = "unstable", target_arch = "x86_64"))] fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD let thresh = c * 3 - (a + b); @@ -311,6 +312,48 @@ fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { return t1; } +#[cfg(not(target_arch = "x86_64"))] +fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { + // Decoding seems to optimize better with this algorithm + let pa = (i16::from(b) - i16::from(c)).abs(); + let pb = (i16::from(a) - i16::from(c)).abs(); + let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs(); + + let mut out = a; + let mut min = pa; + + if pb < min { + min = pb; + out = b; + } + if pc < min { + out = c; + } + + out +} + +#[cfg(all(feature = "unstable", not(target_arch = "x86_64")))] +fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { + // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD + let pa = (b - c).abs(); + let pb = (a - c).abs(); + let pc = ((a - c) + (b - c)).abs(); + + let mut out = a; + let mut min = pa; + + if pb < min { + min = pb; + out = b; + } + if pc < min { + out = c; + } + + out +} + fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { // This is an optimized version of the paeth filter from the PNG specification, proposed by // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates From bb16b836bcacbdbf34a481f217b6ee3a9eb233e7 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 2 Dec 2024 20:34:48 +0000 Subject: [PATCH 11/16] Drop outdated comment --- src/filter.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index a0a3424b..2293f732 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -39,16 +39,6 @@ mod simd { out.into() } - /// This is an equivalent of the `PaethPredictor` function from - /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth) - /// - /// Mapping between parameter names and pixel positions can be found in - /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions). - /// - /// Examples of how different pixel types may be represented as multiple SIMD lanes: - /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A - /// - RGB => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value - /// /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert /// the SIMD elements to `i16`. fn paeth_predictor_u8( From c97f54c35a8fe0821587720aba3fe79dfd0772e4 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 2 Dec 2024 20:51:52 +0000 Subject: [PATCH 12/16] Disable unstable bpp=3 and bpp=6 codepaths on non-x86, add comments --- src/filter.rs | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index 2293f732..db02f26e 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -293,7 +293,8 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { #[cfg(all(feature = "unstable", target_arch = "x86_64"))] fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { - // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD + // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD types. + // Used for bpp=3 and bpp=6 let thresh = c * 3 - (a + b); let lo = a.min(b); let hi = a.max(b); @@ -304,7 +305,9 @@ fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { #[cfg(not(target_arch = "x86_64"))] fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { - // Decoding seems to optimize better with this algorithm + // On ARM this algorithm performs much better than the one above adapted from stb, + // and this is the better-studied algorithm we've always used here, + // so we default to it on all non-x86 platforms. let pa = (i16::from(b) - i16::from(c)).abs(); let pb = (i16::from(a) - i16::from(c)).abs(); let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs(); @@ -323,27 +326,6 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { out } -#[cfg(all(feature = "unstable", not(target_arch = "x86_64")))] -fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { - // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD - let pa = (b - c).abs(); - let pb = (a - c).abs(); - let pc = ((a - c) + (b - c)).abs(); - - let mut out = a; - let mut min = pa; - - if pb < min { - min = pb; - out = b; - } - if pc < min { - out = c; - } - - out -} - fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { // This is an optimized version of the paeth filter from the PNG specification, proposed by // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates @@ -754,7 +736,8 @@ pub(crate) fn unfilter( } } BytesPerPixel::Three => { - #[cfg(feature = "unstable")] + // Do not enable this algorithm on ARM, that would be a big performance hit + #[cfg(all(feature = "unstable", target_arch = "x86_64"))] simd::unfilter_paeth3(previous, current); #[cfg(not(feature = "unstable"))] @@ -813,7 +796,8 @@ pub(crate) fn unfilter( } } BytesPerPixel::Six => { - #[cfg(feature = "unstable")] + // Doesn't help performance on ARM, so not enabled to reduce on complexity + #[cfg(all(feature = "unstable", target_arch = "x86_64"))] simd::unfilter_paeth6(previous, current); #[cfg(not(feature = "unstable"))] From 5c2538cdeb847287a4251bcb54c8693319970fdd Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 2 Dec 2024 21:01:32 +0000 Subject: [PATCH 13/16] Only build Paeth bpp=3 and bpp=6 routines on x86; fixes build on ARM --- src/filter.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/filter.rs b/src/filter.rs index db02f26e..e03f015a 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -9,6 +9,8 @@ use crate::common::BytesPerPixel; /// feature of Rust gets stabilized. #[cfg(feature = "unstable")] mod simd { + // unused imports may ocur in this module due to conditional compilation + #![allow(unused_imports)] use std::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd}; use std::simd::num::{SimdInt, SimdUint}; use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; @@ -24,6 +26,7 @@ mod simd { /// Funnily, the autovectorizer does a better job here /// than a handwritten algorithm using std::simd! /// We used to have a handwritten one but this is just faster. + #[cfg(target_arch = "x86_64")] fn paeth_predictor( a: Simd, b: Simd, @@ -75,6 +78,7 @@ mod simd { /// /// `b` is the current pixel in the previous row. `x` is the current pixel in the current row. /// See also https://www.w3.org/TR/png/#filter-byte-positions + #[cfg(target_arch = "x86_64")] fn paeth_step( state: &mut PaethState, b: Simd, @@ -112,15 +116,18 @@ mod simd { state.a = *x; } + #[cfg(target_arch = "x86_64")] fn load3(src: &[u8]) -> u8x4 { u8x4::from_array([src[0], src[1], src[2], 0]) } + #[cfg(target_arch = "x86_64")] fn store3(src: u8x4, dest: &mut [u8]) { dest[0..3].copy_from_slice(&src.to_array()[0..3]) } /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. + #[cfg(target_arch = "x86_64")] pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); debug_assert_eq!(prev_row.len() % 3, 0); @@ -175,15 +182,18 @@ mod simd { } } + #[cfg(target_arch = "x86_64")] fn load6(src: &[u8]) -> u8x8 { u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0]) } + #[cfg(target_arch = "x86_64")] fn store6(src: u8x8, dest: &mut [u8]) { dest[0..6].copy_from_slice(&src.to_array()[0..6]) } /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. + #[cfg(target_arch = "x86_64")] pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); debug_assert_eq!(prev_row.len() % 6, 0); From d5e97419ef2bd944df7ae307df5a11c1a582568d Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 2 Dec 2024 21:16:36 +0000 Subject: [PATCH 14/16] Run tests on ARM on CI --- .github/workflows/rust.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 09f514d9..07fdcdd8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -54,7 +54,10 @@ jobs: rustup target add powerpc-unknown-linux-gnu cargo build --target powerpc-unknown-linux-gnu test_all: - runs-on: ubuntu-latest + strategy: + matrix: + os: [ubuntu-latest, macos-latest] # macos-latest is ARM + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - run: rustup default stable From 54d2b7abbc990029b58f1f9f7769982db2cc5346 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 2 Dec 2024 21:27:13 +0000 Subject: [PATCH 15/16] Run cargo check --features=unstable on ARM too --- .github/workflows/rust.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 07fdcdd8..6a04428f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -32,8 +32,9 @@ jobs: feature_check: strategy: matrix: - features: ["", "benchmarks"] - runs-on: ubuntu-latest + features: ["", "unstable", "benchmarks"] + os: [ubuntu-latest, macos-latest] # macos-latest is ARM + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - uses: actions-rs/toolchain@v1 From 95fabd4a83d678b116c20d8bc36938d010f0b5a8 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 2 Dec 2024 21:58:32 +0000 Subject: [PATCH 16/16] Gate the entire SIMD module on x86_64 to reduce the amount of possible configurations in conditional compilation --- src/filter.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index e03f015a..aa8c4aa4 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -7,11 +7,13 @@ use crate::common::BytesPerPixel; /// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the /// "unstable" feature of the `png` crate. This should be possible once the "portable_simd" /// feature of Rust gets stabilized. -#[cfg(feature = "unstable")] +/// +/// This is only known to help on x86, with no change measured on most benchmarks on ARM, +/// and even severely regressing some of them. +/// So despite the code being portable, we only enable this for x86. +/// We can add more platforms once this code is proven to be beneficial for them. +#[cfg(all(feature = "unstable", target_arch = "x86_64"))] mod simd { - // unused imports may ocur in this module due to conditional compilation - #![allow(unused_imports)] - use std::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd}; use std::simd::num::{SimdInt, SimdUint}; use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; @@ -26,7 +28,6 @@ mod simd { /// Funnily, the autovectorizer does a better job here /// than a handwritten algorithm using std::simd! /// We used to have a handwritten one but this is just faster. - #[cfg(target_arch = "x86_64")] fn paeth_predictor( a: Simd, b: Simd, @@ -78,7 +79,6 @@ mod simd { /// /// `b` is the current pixel in the previous row. `x` is the current pixel in the current row. /// See also https://www.w3.org/TR/png/#filter-byte-positions - #[cfg(target_arch = "x86_64")] fn paeth_step( state: &mut PaethState, b: Simd, @@ -116,18 +116,15 @@ mod simd { state.a = *x; } - #[cfg(target_arch = "x86_64")] fn load3(src: &[u8]) -> u8x4 { u8x4::from_array([src[0], src[1], src[2], 0]) } - #[cfg(target_arch = "x86_64")] fn store3(src: u8x4, dest: &mut [u8]) { dest[0..3].copy_from_slice(&src.to_array()[0..3]) } /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. - #[cfg(target_arch = "x86_64")] pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); debug_assert_eq!(prev_row.len() % 3, 0); @@ -182,18 +179,15 @@ mod simd { } } - #[cfg(target_arch = "x86_64")] fn load6(src: &[u8]) -> u8x8 { u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0]) } - #[cfg(target_arch = "x86_64")] fn store6(src: u8x8, dest: &mut [u8]) { dest[0..6].copy_from_slice(&src.to_array()[0..6]) } /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. - #[cfg(target_arch = "x86_64")] pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); debug_assert_eq!(prev_row.len() % 6, 0); @@ -775,7 +769,7 @@ pub(crate) fn unfilter( } } BytesPerPixel::Four => { - #[cfg(feature = "unstable")] + #[cfg(all(feature = "unstable", target_arch = "x86_64"))] simd::unfilter_paeth_u8::<4>(previous, current); #[cfg(not(feature = "unstable"))] @@ -806,7 +800,6 @@ pub(crate) fn unfilter( } } BytesPerPixel::Six => { - // Doesn't help performance on ARM, so not enabled to reduce on complexity #[cfg(all(feature = "unstable", target_arch = "x86_64"))] simd::unfilter_paeth6(previous, current); @@ -844,7 +837,7 @@ pub(crate) fn unfilter( } } BytesPerPixel::Eight => { - #[cfg(feature = "unstable")] + #[cfg(all(feature = "unstable", target_arch = "x86_64"))] simd::unfilter_paeth_u8::<8>(previous, current); #[cfg(not(feature = "unstable"))]