Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,274 changes: 1,270 additions & 4 deletions fearless_simd/src/generated/avx2.rs

Large diffs are not rendered by default.

708 changes: 708 additions & 0 deletions fearless_simd/src/generated/fallback.rs

Large diffs are not rendered by default.

1,791 changes: 1,791 additions & 0 deletions fearless_simd/src/generated/neon.rs

Large diffs are not rendered by default.

344 changes: 342 additions & 2 deletions fearless_simd/src/generated/simd_trait.rs

Large diffs are not rendered by default.

360 changes: 360 additions & 0 deletions fearless_simd/src/generated/simd_types.rs

Large diffs are not rendered by default.

1,141 changes: 1,141 additions & 0 deletions fearless_simd/src/generated/sse4_2.rs

Large diffs are not rendered by default.

1,155 changes: 1,155 additions & 0 deletions fearless_simd/src/generated/wasm.rs

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions fearless_simd/src/support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,24 @@ pub(crate) fn simd_debug_impl<Element: core::fmt::Debug>(
.field("simd", token)
.finish()
}

/// Selects the input operands to be used for `palignr`/`vext`/etc. when computing a single output
/// block for cross-block "slide" operations. Extracts from the concatenation [a : b].
#[inline(always)]
#[allow(clippy::allow_attributes, reason = "Only needed in some cfgs.")]
#[allow(dead_code, reason = "Only used in some cfgs.")]
pub(crate) fn cross_block_slide_blocks_at<const N: usize, Block: Copy>(
    a: &[Block; N],
    b: &[Block; N],
    out_idx: usize,
    shift_bytes: usize,
) -> [Block; 2] {
    const BLOCK_BYTES: usize = 16;
    // Byte offset, within the 2N-block concatenation [a : b], where the output block begins.
    let start_byte = out_idx * BLOCK_BYTES + shift_bytes;
    // The output block straddles at most two consecutive input blocks: the one containing
    // `start_byte` and its successor.
    let first = start_byte / BLOCK_BYTES;
    // Indices 0..N select from `a`; indices N..2N select from `b`.
    let pick = |idx: usize| if idx < N { a[idx] } else { b[idx - N] };
    [pick(first), pick(first + 1)]
}
11 changes: 9 additions & 2 deletions fearless_simd_dev_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
fn #sse4_name() {
if std::arch::is_x86_feature_detected!("sse4.2") {
let sse4 = unsafe { fearless_simd::x86::Sse4_2::new_unchecked() };
#input_fn_name(sse4);
sse4.vectorize(
#[inline(always)]
|| #input_fn_name(sse4)
);
}
}
};
Expand All @@ -94,7 +97,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
&& std::arch::is_x86_feature_detected!("fma")
{
let avx2 = unsafe { fearless_simd::x86::Avx2::new_unchecked() };
#input_fn_name(avx2);
avx2.vectorize(
#[inline(always)]
|| #input_fn_name(avx2)
);
}
}
};
Expand All @@ -110,6 +116,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
};

quote! {
#[inline(always)]
#input_fn

#fallback_snippet
Expand Down
28 changes: 27 additions & 1 deletion fearless_simd_gen/src/generic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use proc_macro2::{Ident, Span, TokenStream};
use quote::{ToTokens, quote};

use crate::{
ops::{Op, OpSig, RefKind},
ops::{Op, OpSig, RefKind, SlideGranularity},
types::{ScalarType, VecType},
};

Expand Down Expand Up @@ -203,6 +203,32 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream {
}
OpSig::FromBytes => generic_from_bytes(method_sig, ty),
OpSig::ToBytes => generic_to_bytes(method_sig, ty),
OpSig::Slide { granularity, .. } => {
match (granularity, ty.n_bits()) {
(SlideGranularity::WithinBlocks, 128) => {
// If this operation is done on a 128-bit vector type, the "within blocks" method is identical to the
// non-within-blocks one, so just defer to that.
let non_blockwise = generic_op_name("slide", ty);
quote! {
#method_sig {
self.#non_blockwise::<SHIFT>(a, b)
}
}
}
(SlideGranularity::WithinBlocks, _) => {
quote! {
#method_sig {
let (a0, a1) = self.#split(a);
let (b0, b1) = self.#split(b);
self.#combine(self.#do_half::<SHIFT>(a0, b0), self.#do_half::<SHIFT>(a1, b1))
}
}
}
_ => {
panic!("Item-wise shifts across blocks cannot be done via split/combine");
}
}
}
}
}

Expand Down
9 changes: 9 additions & 0 deletions fearless_simd_gen/src/level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ pub(crate) trait Level {
}
}

/// Any additional supporting code necessary for the module's implementation, but placed *after* the `Simd`
/// implementation itself.
fn make_module_footer(&self) -> TokenStream {
TokenStream::new()
}

/// The body of the `Simd::level` function. This can be overridden, e.g. to return `Level::baseline()` if we know a
/// higher SIMD level is statically enabled.
fn make_level_body(&self) -> TokenStream {
Expand Down Expand Up @@ -209,6 +215,7 @@ pub(crate) trait Level {
let arch_types_impl = self.impl_arch_types();
let simd_impl = self.make_simd_impl();
let ty_impl = self.make_type_impl();
let footer = self.make_module_footer();

quote! {
use crate::{prelude::*, seal::Seal, arch_types::ArchTypes, Level};
Expand All @@ -234,6 +241,8 @@ pub(crate) trait Level {
#simd_impl

#ty_impl

#footer
}
}
}
11 changes: 11 additions & 0 deletions fearless_simd_gen/src/mk_fallback.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,17 @@ impl Level for Fallback {
}
}
}
OpSig::Slide { .. } => {
let n = vec_ty.len;
quote! {
#method_sig {
let mut dest = [Default::default(); #n];
dest[..#n - SHIFT].copy_from_slice(&a.val.0[SHIFT..]);
dest[#n - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]);
dest.simd_into(self)
}
}
}
OpSig::Cvt {
target_ty,
scalar_bits,
Expand Down
94 changes: 93 additions & 1 deletion fearless_simd_gen/src/mk_neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::generic::{
generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_to_bytes,
};
use crate::level::Level;
use crate::ops::{Op, valid_reinterpret};
use crate::ops::{Op, SlideGranularity, valid_reinterpret};
use crate::{
arch::neon::{self, cvt_intrinsic, simple_intrinsic, split_intrinsic},
ops::OpSig,
Expand Down Expand Up @@ -66,6 +66,10 @@ impl Level for Neon {
}
}

    /// Emits NEON-specific support code after the generated `Simd` implementation,
    /// overriding the empty default provided by the `Level` trait. Currently this is
    /// the `dyn_vext_128` helper produced by [`mk_slide_helpers`].
    fn make_module_footer(&self) -> TokenStream {
        mk_slide_helpers()
    }

fn make_impl_body(&self) -> TokenStream {
quote! {
#[inline]
Expand Down Expand Up @@ -395,6 +399,72 @@ impl Level for Neon {
}
}
}
OpSig::Slide { granularity } => {
use SlideGranularity::*;

let block_wrapper = vec_ty.aligned_wrapper();
let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8);
let combined_bytes = bytes_ty.rust();
let scalar_bytes = vec_ty.scalar_bits / 8;
let num_items = vec_ty.len;
let to_bytes = generic_op_name("cvt_to_bytes", vec_ty);
let from_bytes = generic_op_name("cvt_from_bytes", vec_ty);

let byte_shift = if scalar_bytes == 1 {
quote! { SHIFT }
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case we probably wouldn't need to call the dyn methods either but can just call the intrinsic, right?

} else {
quote! { SHIFT * #scalar_bytes }
};

let bytes_expr = match (granularity, vec_ty.n_bits()) {
(WithinBlocks, 128) => {
panic!("This should have been handled by generic_op");
}
(WithinBlocks, _) | (_, 128) => {
quote! {
unsafe {
dyn_vext_128(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift)
}
}
}
(AcrossBlocks, 256 | 512) => {
let num_blocks = vec_ty.n_bits() / 128;

// Ranges are not `Copy`, so we need to create a new range iterator for each usage
let blocks = (0..num_blocks).map(Literal::usize_unsuffixed);
let blocks2 = blocks.clone();
let blocks3 = blocks.clone();
let bytes_arch_ty = self.arch_ty(&bytes_ty);

quote! {
unsafe {
let a_bytes = self.#to_bytes(a).val.0;
let b_bytes = self.#to_bytes(b).val.0;
let a_blocks = [#( a_bytes.#blocks ),*];
let b_blocks = [#( b_bytes.#blocks2 ),*];

let shift_bytes = #byte_shift;
#bytes_arch_ty(#({
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a_blocks, &b_blocks, #blocks3, shift_bytes);
dyn_vext_128(lo, hi, shift_bytes % 16)
}),*)
}
}
}
_ => unimplemented!(),
};

quote! {
#method_sig {
if SHIFT >= #num_items {
return b;
}

let result = #bytes_expr;
self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
}
}
}
OpSig::Cvt {
target_ty,
scalar_bits,
Expand Down Expand Up @@ -478,3 +548,25 @@ impl Level for Neon {
}
}
}

/// Generates NEON slide helpers: `dyn_vext_128`, a wrapper around the `vextq_u8`
/// intrinsic that accepts the shift as a runtime value instead of a const generic.
fn mk_slide_helpers() -> TokenStream {
    // One match arm per possible byte shift (0..16), each dispatching to the
    // const-generic intrinsic with that shift baked in.
    let mut arms = Vec::with_capacity(16);
    for shift in 0_usize..16 {
        let shift_i32 = i32::try_from(shift).unwrap();
        arms.push(quote! { #shift => vextq_u8::<#shift_i32>(a, b) });
    }

    quote! {
        /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still
        /// expected to be constant in practice, so the match statement will be optimized out. This exists because
        /// Rust doesn't currently let you do math on const generics.
        #[inline(always)]
        unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
            unsafe {
                match shift {
                    #(#arms,)*
                    _ => unreachable!()
                }
            }
        }
    }
}
17 changes: 14 additions & 3 deletions fearless_simd_gen/src/mk_simd_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use proc_macro2::TokenStream;
use quote::quote;

use crate::{
ops::{OpKind, TyFlavor, ops_for_type, overloaded_ops_for, vec_trait_ops_for},
ops::{OpKind, TyFlavor, base_trait_ops, ops_for_type, overloaded_ops_for, vec_trait_ops_for},
types::{SIMD_TYPES, ScalarType, type_imports},
};

Expand Down Expand Up @@ -130,6 +130,17 @@ pub(crate) fn mk_arch_types() -> TokenStream {
}

fn mk_simd_base() -> TokenStream {
let mut methods = vec![];
for op in base_trait_ops() {
let doc = op.format_docstring(TyFlavor::VecImpl);
if let Some(method_sig) = op.vec_trait_method_sig() {
methods.push(quote! {
#[doc = #doc]
#method_sig;
});
}
}

quote! {
/// Base functionality implemented by all SIMD vectors.
pub trait SimdBase<S: Simd>:
Expand Down Expand Up @@ -167,15 +178,15 @@ fn mk_simd_base() -> TokenStream {
///
/// The slice must be the proper width.
fn from_slice(simd: S, slice: &[Self::Element]) -> Self;
/// Create a SIMD vector with all elements set to the given value.
fn splat(simd: S, val: Self::Element) -> Self;
/// Create a SIMD vector from a 128-bit vector of the same scalar
/// type, repeated.
fn block_splat(block: Self::Block) -> Self;
/// Create a SIMD vector where each element is produced by
/// calling `f` with that element's lane index (from 0 to
/// [`SimdBase::N`] - 1).
fn from_fn(simd: S, f: impl FnMut(usize) -> Self::Element) -> Self;

#( #methods )*
}
}
}
Expand Down
11 changes: 11 additions & 0 deletions fearless_simd_gen/src/mk_simd_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,8 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream {
let from_array_op = generic_op_name("load_array", ty);
let as_array_ref_op = generic_op_name("as_array_ref", ty);
let as_array_mut_op = generic_op_name("as_array_mut", ty);
let slide_op = generic_op_name("slide", ty);
let slide_blockwise_op = generic_op_name("slide_within_blocks", ty);
quote! {
impl<S: Simd> SimdBase<S> for #name<S> {
type Element = #scalar;
Expand Down Expand Up @@ -334,6 +336,15 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream {
simd.#from_array_op(core::array::from_fn(f))
}

#[inline(always)]
fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
self.simd.#slide_op::<SHIFT>(self, rhs.simd_into(self.simd))
}

#[inline(always)]
fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
self.simd.#slide_blockwise_op::<SHIFT>(self, rhs.simd_into(self.simd))
}
}
impl<S: Simd> crate::#vec_trait_id<S> for #name<S> {
#( #methods )*
Expand Down
Loading