From 21b475c22b1410e91ecb687f413e4411234a8454 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 07:42:11 +0000
Subject: [PATCH 01/16] Initial port of stb_image PNG paeth unfiltering

---
 src/filter.rs | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index 444a14d6..b17c187a 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -319,23 +319,19 @@ impl Default for AdaptiveFilterType {
 }
 
 fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
-    // Decoding seems to optimize better with this algorithm
-    let pa = (i16::from(b) - i16::from(c)).abs();
-    let pb = (i16::from(a) - i16::from(c)).abs();
-    let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();
-
-    let mut out = a;
-    let mut min = pa;
-
-    if pb < min {
-        min = pb;
-        out = b;
-    }
-    if pc < min {
-        out = c;
-    }
-
-    out
+    // This formulation looks very different from the reference in the PNG spec, but is
+    // actually equivalent and has favorable data dependencies and admits straightforward
+    // generation of branch-free code, which helps performance significantly.
+    //
+    // Adapted from public domain PNG implementation:
+    // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
+    let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
+    let thresh = thresh as u8;
+    let lo = a.min(b);
+    let hi = a.max(b);
+    let t0 = if hi <= thresh { lo } else { c };
+    let t1 = if thresh <= lo { hi } else { t0 };
+    return t1;
 }
 
 #[cfg(feature = "unstable")]

From 14c448d14d05f80f2bb35a967653299b097c9e7e Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 08:10:04 +0000
Subject: [PATCH 02/16] Sneaky fix, you saw nothing

---
 src/filter.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index b17c187a..a14ff758 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -325,13 +325,16 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
     //
     // Adapted from public domain PNG implementation:
     // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
-    let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
-    let thresh = thresh as u8;
+    let a = i16::from(a);
+    let b = i16::from(b);
+    let c = i16::from(c);
+
+    let thresh = c * 3 - (a + b);
     let lo = a.min(b);
     let hi = a.max(b);
     let t0 = if hi <= thresh { lo } else { c };
     let t1 = if thresh <= lo { hi } else { t0 };
-    return t1;
+    return t1 as u8;
 }
 
 #[cfg(feature = "unstable")]

From 5b19c5bc96f83fd8ddf00cced8face34e4929d86 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 08:12:33 +0000
Subject: [PATCH 03/16] Don't widen types where it's not necessary

---
 src/filter.rs | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index a14ff758..00ca5b2d 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -325,16 +325,12 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
     //
     // Adapted from public domain PNG implementation:
     // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
-    let a = i16::from(a);
-    let b = i16::from(b);
-    let c = i16::from(c);
-
-    let thresh = c * 3 - (a + b);
+    let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
     let lo = a.min(b);
     let hi = a.max(b);
-    let t0 = if hi <= thresh { lo } else { c };
-    let t1 = if thresh <= lo { hi } else { t0 };
-    return t1 as u8;
+    let t0 = if hi as i16 <= thresh { lo } else { c };
+    let t1 = if thresh <= lo as i16 { hi } else { t0 };
+    return t1;
 }
 
 #[cfg(feature = "unstable")]

From 7bfaa4949a159b4f1fed840fd89b5fb3f2808eb1 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 08:17:14 +0000
Subject: [PATCH 04/16] Restore comment

---
 src/filter.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/filter.rs b/src/filter.rs
index 00ca5b2d..856008ec 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -319,6 +319,8 @@ impl Default for AdaptiveFilterType {
 }
 
 fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
+    // Decoding optimizes better with this algorithm than with `filter_paeth()`
+    //
     // This formulation looks very different from the reference in the PNG spec, but is
     // actually equivalent and has favorable data dependencies and admits straightforward
     // generation of branch-free code, which helps performance significantly.

From a557386dec65099c7ae3c9a5694fb42baf9b354d Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 08:31:49 +0000
Subject: [PATCH 05/16] Add an equivalence test for the Paeth implementations

---
 src/filter.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/filter.rs b/src/filter.rs
index 856008ec..c56ff84a 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -1157,6 +1157,18 @@ mod test {
         }
     }
 
+    #[test]
+    fn paeth_impls_are_equivalent() {
+        use super::{filter_paeth, filter_paeth_decode};
+        for a in 0..=255 {
+            for b in 0..=255 {
+                for c in 0..=255 {
+                    assert_eq!(filter_paeth(a, b, c), filter_paeth_decode(a, b, c));
+                }
+            }
+        }
+    }
+
     #[test]
     fn roundtrip_ascending_previous_line() {
         // A multiple of 8, 6, 4, 3, 2, 1

From f30ec3c3e29b4c2e222e07517410573942c804fc Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 20:10:01 +0000
Subject: [PATCH 06/16] Convert nightly-only portable SIMD path to the
 stb_image Paeth algorithm

---
 src/filter.rs | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index c56ff84a..8e181576 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -338,22 +338,12 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
 #[cfg(feature = "unstable")]
 fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
     // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD
-    let pa = (b - c).abs();
-    let pb = (a - c).abs();
-    let pc = ((a - c) + (b - c)).abs();
-
-    let mut out = a;
-    let mut min = pa;
-
-    if pb < min {
-        min = pb;
-        out = b;
-    }
-    if pc < min {
-        out = c;
-    }
-
-    out
+    let thresh = c * 3 - (a + b);
+    let lo = a.min(b);
+    let hi = a.max(b);
+    let t0 = if hi <= thresh { lo } else { c };
+    let t1 = if thresh <= lo { hi } else { t0 };
+    return t1;
 }
 
 fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {

From 21c5b4e5b7d547d086bb1805e3b63aef40f76c64 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 20:36:35 +0000
Subject: [PATCH 07/16] Drop the explicit SIMD Paeth algorithm, it's faster to
 use an autovectorized version of the scalar one

---
 src/filter.rs | 45 +++++----------------------------------------
 1 file changed, 5 insertions(+), 40 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index 8e181576..ef170f8a 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -41,7 +41,7 @@ mod simd {
 
     /// This is an equivalent of the `PaethPredictor` function from
     /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
-    /// except that it simultaneously calculates the predictor for all SIMD lanes.
+    /// 
     /// Mapping between parameter names and pixel positions can be found in
     /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
     ///
@@ -49,8 +49,6 @@ mod simd {
     /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
     /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value
     ///
-    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
-    ///
     /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert
     /// the SIMD elements to `i16`.
     fn paeth_predictor_u8<const N: usize>(
@@ -61,44 +59,11 @@ mod simd {
     where
         LaneCount<N>: SupportedLaneCount,
     {
-        // Calculates the absolute difference between `a` and `b`.
-        fn abs_diff_simd<const N: usize>(a: Simd<u8, N>, b: Simd<u8, N>) -> Simd<u8, N>
-        where
-            LaneCount<N>: SupportedLaneCount,
-        {
-            a.simd_max(b) - b.simd_min(a)
+        let mut out = [0; N];
+        for i in 0..N {
+            out[i] = super::filter_paeth_decode(a[i].into(), b[i].into(), c[i].into());
         }
-
-        // Uses logic from `filter::filter_paeth` to calculate absolute values
-        // entirely in `Simd<u8, N>`. This method avoids unpacking and packing
-        // penalties resulting from conversion to and from `Simd<i16, N>`.
-        // ```
-        //     let pa = b.max(c) - c.min(b);
-        //     let pb = a.max(c) - c.min(a);
-        //     let pc = if (a < c) == (c < b) {
-        //         pa.max(pb) - pa.min(pb)
-        //     } else {
-        //         255
-        //     };
-        // ```
-        let pa = abs_diff_simd(b, c);
-        let pb = abs_diff_simd(a, c);
-        let pc = a
-            .simd_lt(c)
-            .simd_eq(c.simd_lt(b))
-            .select(abs_diff_simd(pa, pb), Simd::splat(255));
-
-        let smallest = pc.simd_min(pa.simd_min(pb));
-
-        // Paeth algorithm breaks ties favoring a over b over c, so we execute the following
-        // lane-wise selection:
-        //
-        //     if smalest == pa
-        //         then select a
-        //         else select (if smallest == pb then select b else select c)
-        smallest
-            .simd_eq(pa)
-            .select(a, smallest.simd_eq(pb).select(b, c))
+        out.into()
     }
 
     /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`).

From e0b19538147ca5ef0e36a2393730f40b9929bdb1 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 20:42:30 +0000
Subject: [PATCH 08/16] cargo fmt

---
 src/filter.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/filter.rs b/src/filter.rs
index ef170f8a..f5bb959e 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -41,7 +41,7 @@ mod simd {
 
     /// This is an equivalent of the `PaethPredictor` function from
     /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
-    /// 
+    ///
     /// Mapping between parameter names and pixel positions can be found in
     /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
     ///

From 9ed0d9e4682383506b1c48fe7a80fde784457373 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 1 Dec 2024 21:04:23 +0000
Subject: [PATCH 09/16] ignore the paeth equivalence test by default to avoid
 inflating CI times

---
 src/filter.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/filter.rs b/src/filter.rs
index f5bb959e..87a30051 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -1113,6 +1113,7 @@ mod test {
     }
 
     #[test]
+    #[ignore] // takes ~20s without optimizations
     fn paeth_impls_are_equivalent() {
         use super::{filter_paeth, filter_paeth_decode};
         for a in 0..=255 {

From 3e083e9278d847c6a3724b5448282c3f4c6232d7 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 2 Dec 2024 02:52:20 +0000
Subject: [PATCH 10/16] Use the previous Paeth impl on non-x86 platforms

---
 src/filter.rs | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/filter.rs b/src/filter.rs
index 87a30051..a0a3424b 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -283,6 +283,7 @@ impl Default for AdaptiveFilterType {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
 fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
     // Decoding optimizes better with this algorithm than with `filter_paeth()`
     //
@@ -300,7 +301,7 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
     return t1;
 }
 
-#[cfg(feature = "unstable")]
+#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
 fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
     // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD
     let thresh = c * 3 - (a + b);
@@ -311,6 +312,48 @@ fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
     return t1;
 }
 
+#[cfg(not(target_arch = "x86_64"))]
+fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
+    // Decoding seems to optimize better with this algorithm
+    let pa = (i16::from(b) - i16::from(c)).abs();
+    let pb = (i16::from(a) - i16::from(c)).abs();
+    let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();
+
+    let mut out = a;
+    let mut min = pa;
+
+    if pb < min {
+        min = pb;
+        out = b;
+    }
+    if pc < min {
+        out = c;
+    }
+
+    out
+}
+
+#[cfg(all(feature = "unstable", not(target_arch = "x86_64")))]
+fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
+    // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD
+    let pa = (b - c).abs();
+    let pb = (a - c).abs();
+    let pc = ((a - c) + (b - c)).abs();
+
+    let mut out = a;
+    let mut min = pa;
+
+    if pb < min {
+        min = pb;
+        out = b;
+    }
+    if pc < min {
+        out = c;
+    }
+
+    out
+}
+
 fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
     // This is an optimized version of the paeth filter from the PNG specification, proposed by
     // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates

From bb16b836bcacbdbf34a481f217b6ee3a9eb233e7 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 2 Dec 2024 20:34:48 +0000
Subject: [PATCH 11/16] Drop outdated comment

---
 src/filter.rs | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index a0a3424b..2293f732 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -39,16 +39,6 @@ mod simd {
         out.into()
     }
 
-    /// This is an equivalent of the `PaethPredictor` function from
-    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
-    ///
-    /// Mapping between parameter names and pixel positions can be found in
-    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
-    ///
-    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
-    /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
-    /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value
-    ///
     /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert
     /// the SIMD elements to `i16`.
     fn paeth_predictor_u8<const N: usize>(

From c97f54c35a8fe0821587720aba3fe79dfd0772e4 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 2 Dec 2024 20:51:52 +0000
Subject: [PATCH 12/16] Disable unstable bpp=3 and bpp=6 codepaths on non-x86,
 add comments

---
 src/filter.rs | 34 +++++++++-------------------------
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index 2293f732..db02f26e 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -293,7 +293,8 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
 
 #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
 fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
-    // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD
+    // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD types.
+    // Used for bpp=3 and bpp=6
     let thresh = c * 3 - (a + b);
     let lo = a.min(b);
     let hi = a.max(b);
@@ -304,7 +305,9 @@ fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
 
 #[cfg(not(target_arch = "x86_64"))]
 fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
-    // Decoding seems to optimize better with this algorithm
+    // On ARM this algorithm performs much better than the one above adapted from stb,
+    // and this is the better-studied algorithm we've always used here,
+    // so we default to it on all non-x86 platforms.
     let pa = (i16::from(b) - i16::from(c)).abs();
     let pb = (i16::from(a) - i16::from(c)).abs();
     let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();
@@ -323,27 +326,6 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
     out
 }
 
-#[cfg(all(feature = "unstable", not(target_arch = "x86_64")))]
-fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
-    // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD
-    let pa = (b - c).abs();
-    let pb = (a - c).abs();
-    let pc = ((a - c) + (b - c)).abs();
-
-    let mut out = a;
-    let mut min = pa;
-
-    if pb < min {
-        min = pb;
-        out = b;
-    }
-    if pc < min {
-        out = c;
-    }
-
-    out
-}
-
 fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
     // This is an optimized version of the paeth filter from the PNG specification, proposed by
     // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
@@ -754,7 +736,8 @@ pub(crate) fn unfilter(
                     }
                 }
                 BytesPerPixel::Three => {
-                    #[cfg(feature = "unstable")]
+                    // Do not enable this algorithm on ARM, that would be a big performance hit
+                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                     simd::unfilter_paeth3(previous, current);
 
                     #[cfg(not(feature = "unstable"))]
@@ -813,7 +796,8 @@ pub(crate) fn unfilter(
                     }
                 }
                 BytesPerPixel::Six => {
-                    #[cfg(feature = "unstable")]
+                    // Doesn't help performance on ARM, so not enabled to reduce on complexity
+                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                     simd::unfilter_paeth6(previous, current);
 
                     #[cfg(not(feature = "unstable"))]

From 5c2538cdeb847287a4251bcb54c8693319970fdd Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 2 Dec 2024 21:01:32 +0000
Subject: [PATCH 13/16] Only build Paeth bpp=3 and bpp=6 routines on x86; fixes
 build on ARM

---
 src/filter.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/filter.rs b/src/filter.rs
index db02f26e..e03f015a 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -9,6 +9,8 @@ use crate::common::BytesPerPixel;
 /// feature of Rust gets stabilized.
 #[cfg(feature = "unstable")]
 mod simd {
+    // unused imports may ocur in this module due to conditional compilation
+    #![allow(unused_imports)]
     use std::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd};
     use std::simd::num::{SimdInt, SimdUint};
     use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};
@@ -24,6 +26,7 @@ mod simd {
     /// Funnily, the autovectorizer does a better job here
     /// than a handwritten algorithm using std::simd!
     /// We used to have a handwritten one but this is just faster.
+    #[cfg(target_arch = "x86_64")]
     fn paeth_predictor<const N: usize>(
         a: Simd<i16, N>,
         b: Simd<i16, N>,
@@ -75,6 +78,7 @@ mod simd {
     ///
     /// `b` is the current pixel in the previous row.  `x` is the current pixel in the current row.
     /// See also https://www.w3.org/TR/png/#filter-byte-positions
+    #[cfg(target_arch = "x86_64")]
     fn paeth_step<const N: usize>(
         state: &mut PaethState<i16, N>,
         b: Simd<u8, N>,
@@ -112,15 +116,18 @@ mod simd {
         state.a = *x;
     }
 
+    #[cfg(target_arch = "x86_64")]
     fn load3(src: &[u8]) -> u8x4 {
         u8x4::from_array([src[0], src[1], src[2], 0])
     }
 
+    #[cfg(target_arch = "x86_64")]
     fn store3(src: u8x4, dest: &mut [u8]) {
         dest[0..3].copy_from_slice(&src.to_array()[0..3])
     }
 
     /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
+    #[cfg(target_arch = "x86_64")]
     pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
         debug_assert_eq!(prev_row.len(), curr_row.len());
         debug_assert_eq!(prev_row.len() % 3, 0);
@@ -175,15 +182,18 @@ mod simd {
         }
     }
 
+    #[cfg(target_arch = "x86_64")]
     fn load6(src: &[u8]) -> u8x8 {
         u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0])
     }
 
+    #[cfg(target_arch = "x86_64")]
     fn store6(src: u8x8, dest: &mut [u8]) {
         dest[0..6].copy_from_slice(&src.to_array()[0..6])
     }
 
     /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
+    #[cfg(target_arch = "x86_64")]
     pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
         debug_assert_eq!(prev_row.len(), curr_row.len());
         debug_assert_eq!(prev_row.len() % 6, 0);

From d5e97419ef2bd944df7ae307df5a11c1a582568d Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 2 Dec 2024 21:16:36 +0000
Subject: [PATCH 14/16] Run tests on ARM on CI

---
 .github/workflows/rust.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 09f514d9..07fdcdd8 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -54,7 +54,10 @@ jobs:
         rustup target add powerpc-unknown-linux-gnu
         cargo build --target powerpc-unknown-linux-gnu
   test_all:
-    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest] # macos-latest is ARM
+    runs-on: ${{ matrix.os }}
     steps:
     - uses: actions/checkout@v4
     - run: rustup default stable

From 54d2b7abbc990029b58f1f9f7769982db2cc5346 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 2 Dec 2024 21:27:13 +0000
Subject: [PATCH 15/16] Run cargo check --features=unstable on ARM too

---
 .github/workflows/rust.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 07fdcdd8..6a04428f 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -32,8 +32,9 @@ jobs:
   feature_check:
     strategy:
       matrix:
-        features: ["", "benchmarks"]
-    runs-on: ubuntu-latest
+        features: ["", "unstable", "benchmarks"]
+        os: [ubuntu-latest, macos-latest] # macos-latest is ARM
+    runs-on: ${{ matrix.os }}
     steps:
     - uses: actions/checkout@v4
     - uses: actions-rs/toolchain@v1

From 95fabd4a83d678b116c20d8bc36938d010f0b5a8 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 2 Dec 2024 21:58:32 +0000
Subject: [PATCH 16/16] Gate the entire SIMD module on x86_64 to reduce the
 amount of possible configurations in conditional compilation

---
 src/filter.rs | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/filter.rs b/src/filter.rs
index e03f015a..aa8c4aa4 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -7,11 +7,13 @@ use crate::common::BytesPerPixel;
 /// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the
 /// "unstable" feature of the `png` crate.  This should be possible once the "portable_simd"
 /// feature of Rust gets stabilized.
-#[cfg(feature = "unstable")]
+///
+/// This is only known to help on x86, with no change measured on most benchmarks on ARM,
+/// and even severely regressing some of them.
+/// So despite the code being portable, we only enable this for x86.
+/// We can add more platforms once this code is proven to be beneficial for them.
+#[cfg(all(feature = "unstable", target_arch = "x86_64"))]
 mod simd {
-    // unused imports may ocur in this module due to conditional compilation
-    #![allow(unused_imports)]
-    use std::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd};
     use std::simd::num::{SimdInt, SimdUint};
     use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};
 
@@ -26,7 +28,6 @@ mod simd {
     /// Funnily, the autovectorizer does a better job here
     /// than a handwritten algorithm using std::simd!
     /// We used to have a handwritten one but this is just faster.
-    #[cfg(target_arch = "x86_64")]
     fn paeth_predictor<const N: usize>(
         a: Simd<i16, N>,
         b: Simd<i16, N>,
@@ -78,7 +79,6 @@ mod simd {
     ///
     /// `b` is the current pixel in the previous row.  `x` is the current pixel in the current row.
     /// See also https://www.w3.org/TR/png/#filter-byte-positions
-    #[cfg(target_arch = "x86_64")]
     fn paeth_step<const N: usize>(
         state: &mut PaethState<i16, N>,
         b: Simd<u8, N>,
@@ -116,18 +116,15 @@ mod simd {
         state.a = *x;
     }
 
-    #[cfg(target_arch = "x86_64")]
     fn load3(src: &[u8]) -> u8x4 {
         u8x4::from_array([src[0], src[1], src[2], 0])
     }
 
-    #[cfg(target_arch = "x86_64")]
     fn store3(src: u8x4, dest: &mut [u8]) {
         dest[0..3].copy_from_slice(&src.to_array()[0..3])
     }
 
     /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
-    #[cfg(target_arch = "x86_64")]
     pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
         debug_assert_eq!(prev_row.len(), curr_row.len());
         debug_assert_eq!(prev_row.len() % 3, 0);
@@ -182,18 +179,15 @@ mod simd {
         }
     }
 
-    #[cfg(target_arch = "x86_64")]
     fn load6(src: &[u8]) -> u8x8 {
         u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0])
     }
 
-    #[cfg(target_arch = "x86_64")]
     fn store6(src: u8x8, dest: &mut [u8]) {
         dest[0..6].copy_from_slice(&src.to_array()[0..6])
     }
 
     /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
-    #[cfg(target_arch = "x86_64")]
     pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
         debug_assert_eq!(prev_row.len(), curr_row.len());
         debug_assert_eq!(prev_row.len() % 6, 0);
@@ -775,7 +769,7 @@ pub(crate) fn unfilter(
                     }
                 }
                 BytesPerPixel::Four => {
-                    #[cfg(feature = "unstable")]
+                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                     simd::unfilter_paeth_u8::<4>(previous, current);
 
                     #[cfg(not(feature = "unstable"))]
@@ -806,7 +800,6 @@ pub(crate) fn unfilter(
                     }
                 }
                 BytesPerPixel::Six => {
-                    // Doesn't help performance on ARM, so not enabled to reduce on complexity
                     #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                     simd::unfilter_paeth6(previous, current);
 
@@ -844,7 +837,7 @@ pub(crate) fn unfilter(
                     }
                 }
                 BytesPerPixel::Eight => {
-                    #[cfg(feature = "unstable")]
+                    #[cfg(all(feature = "unstable", target_arch = "x86_64"))]
                     simd::unfilter_paeth_u8::<8>(previous, current);
 
                     #[cfg(not(feature = "unstable"))]