Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,274 changes: 1,270 additions & 4 deletions fearless_simd/src/generated/avx2.rs

Large diffs are not rendered by default.

708 changes: 708 additions & 0 deletions fearless_simd/src/generated/fallback.rs

Large diffs are not rendered by default.

1,791 changes: 1,791 additions & 0 deletions fearless_simd/src/generated/neon.rs

Large diffs are not rendered by default.

344 changes: 342 additions & 2 deletions fearless_simd/src/generated/simd_trait.rs

Large diffs are not rendered by default.

360 changes: 360 additions & 0 deletions fearless_simd/src/generated/simd_types.rs

Large diffs are not rendered by default.

1,141 changes: 1,141 additions & 0 deletions fearless_simd/src/generated/sse4_2.rs

Large diffs are not rendered by default.

1,155 changes: 1,155 additions & 0 deletions fearless_simd/src/generated/wasm.rs

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions fearless_simd/src/support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,24 @@ pub(crate) fn simd_debug_impl<Element: core::fmt::Debug>(
.field("simd", token)
.finish()
}

/// Selects the input operands to be used for `palignr`/`vext`/etc. when computing a single output
/// block for cross-block "slide" operations. Extracts from the concatenation [a : b].
#[inline(always)]
#[allow(clippy::allow_attributes, reason = "Only needed in some cfgs.")]
#[allow(dead_code, reason = "Only used in some cfgs.")]
pub(crate) fn cross_block_slide_blocks_at<const N: usize, Block: Copy>(
    a: &[Block; N],
    b: &[Block; N],
    out_idx: usize,
    shift_bytes: usize,
) -> [Block; 2] {
    const BLOCK_BYTES: usize = 16;
    // Byte offset, within the 2N-block concatenation [a : b], where the output block begins.
    let start_byte = out_idx * BLOCK_BYTES + shift_bytes;
    // The output block straddles at most two consecutive input blocks: the one containing
    // `start_byte` and its successor.
    let first = start_byte / BLOCK_BYTES;
    // Indices 0..N select from `a`; indices N..2N select from `b`.
    let pick = |idx: usize| if idx < N { a[idx] } else { b[idx - N] };
    [pick(first), pick(first + 1)]
}
11 changes: 9 additions & 2 deletions fearless_simd_dev_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
fn #sse4_name() {
if std::arch::is_x86_feature_detected!("sse4.2") {
let sse4 = unsafe { fearless_simd::x86::Sse4_2::new_unchecked() };
#input_fn_name(sse4);
sse4.vectorize(
#[inline(always)]
|| #input_fn_name(sse4)
);
}
}
};
Expand All @@ -94,7 +97,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
&& std::arch::is_x86_feature_detected!("fma")
{
let avx2 = unsafe { fearless_simd::x86::Avx2::new_unchecked() };
#input_fn_name(avx2);
avx2.vectorize(
#[inline(always)]
|| #input_fn_name(avx2)
);
}
}
};
Expand All @@ -110,6 +116,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
};

quote! {
#[inline(always)]
#input_fn

#fallback_snippet
Expand Down
28 changes: 27 additions & 1 deletion fearless_simd_gen/src/generic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use proc_macro2::{Ident, Span, TokenStream};
use quote::{ToTokens, quote};

use crate::{
ops::{Op, OpSig, RefKind},
ops::{Op, OpSig, RefKind, SlideGranularity},
types::{ScalarType, VecType},
};

Expand Down Expand Up @@ -203,6 +203,32 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream {
}
OpSig::FromBytes => generic_from_bytes(method_sig, ty),
OpSig::ToBytes => generic_to_bytes(method_sig, ty),
OpSig::Slide { granularity, .. } => {
match (granularity, ty.n_bits()) {
(SlideGranularity::WithinBlocks, 128) => {
// If this operation is done on a 128-bit vector type, the "within blocks" method is identical to the
// non-within-blocks one, so just defer to that.
let non_blockwise = generic_op_name("slide", ty);
quote! {
#method_sig {
self.#non_blockwise::<SHIFT>(a, b)
}
}
}
(SlideGranularity::WithinBlocks, _) => {
quote! {
#method_sig {
let (a0, a1) = self.#split(a);
let (b0, b1) = self.#split(b);
self.#combine(self.#do_half::<SHIFT>(a0, b0), self.#do_half::<SHIFT>(a1, b1))
}
}
}
_ => {
panic!("Item-wise shifts across blocks cannot be done via split/combine");
}
}
}
}
}

Expand Down
9 changes: 9 additions & 0 deletions fearless_simd_gen/src/level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ pub(crate) trait Level {
}
}

/// Any additional supporting code necessary for the module's implementation, but placed *after* the `Simd`
/// implementation itself.
fn make_module_footer(&self) -> TokenStream {
TokenStream::new()
}

/// The body of the `Simd::level` function. This can be overridden, e.g. to return `Level::baseline()` if we know a
/// higher SIMD level is statically enabled.
fn make_level_body(&self) -> TokenStream {
Expand Down Expand Up @@ -209,6 +215,7 @@ pub(crate) trait Level {
let arch_types_impl = self.impl_arch_types();
let simd_impl = self.make_simd_impl();
let ty_impl = self.make_type_impl();
let footer = self.make_module_footer();

quote! {
use crate::{prelude::*, seal::Seal, arch_types::ArchTypes, Level};
Expand All @@ -234,6 +241,8 @@ pub(crate) trait Level {
#simd_impl

#ty_impl

#footer
}
}
}
11 changes: 11 additions & 0 deletions fearless_simd_gen/src/mk_fallback.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,17 @@ impl Level for Fallback {
}
}
}
OpSig::Slide { .. } => {
let n = vec_ty.len;
quote! {
#method_sig {
let mut dest = [Default::default(); #n];
dest[..#n - SHIFT].copy_from_slice(&a.val.0[SHIFT..]);
dest[#n - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]);
dest.simd_into(self)
}
}
}
OpSig::Cvt {
target_ty,
scalar_bits,
Expand Down
94 changes: 93 additions & 1 deletion fearless_simd_gen/src/mk_neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::generic::{
generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_to_bytes,
};
use crate::level::Level;
use crate::ops::{Op, valid_reinterpret};
use crate::ops::{Op, SlideGranularity, valid_reinterpret};
use crate::{
arch::neon::{self, cvt_intrinsic, simple_intrinsic, split_intrinsic},
ops::OpSig,
Expand Down Expand Up @@ -66,6 +66,10 @@ impl Level for Neon {
}
}

    /// Emits NEON-specific support code after the generated `Simd` implementation,
    /// overriding the empty default provided by the `Level` trait. Currently this is
    /// the `dyn_vext_128` helper produced by [`mk_slide_helpers`].
    fn make_module_footer(&self) -> TokenStream {
        mk_slide_helpers()
    }

fn make_impl_body(&self) -> TokenStream {
quote! {
#[inline]
Expand Down Expand Up @@ -395,6 +399,72 @@ impl Level for Neon {
}
}
}
OpSig::Slide { granularity } => {
use SlideGranularity::*;

let block_wrapper = vec_ty.aligned_wrapper();
let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8);
let combined_bytes = bytes_ty.rust();
let scalar_bytes = vec_ty.scalar_bits / 8;
let num_items = vec_ty.len;
let to_bytes = generic_op_name("cvt_to_bytes", vec_ty);
let from_bytes = generic_op_name("cvt_from_bytes", vec_ty);

let byte_shift = if scalar_bytes == 1 {
quote! { SHIFT }
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case we probably wouldn't need to call the dyn methods either but can just call the intrinsic, right?

} else {
quote! { SHIFT * #scalar_bytes }
};

let bytes_expr = match (granularity, vec_ty.n_bits()) {
(WithinBlocks, 128) => {
panic!("This should have been handled by generic_op");
}
(WithinBlocks, _) | (_, 128) => {
quote! {
unsafe {
dyn_vext_128(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift)
}
}
}
(AcrossBlocks, 256 | 512) => {
let num_blocks = vec_ty.n_bits() / 128;

// Ranges are not `Copy`, so we need to create a new range iterator for each usage
let blocks = (0..num_blocks).map(Literal::usize_unsuffixed);
let blocks2 = blocks.clone();
let blocks3 = blocks.clone();
let bytes_arch_ty = self.arch_ty(&bytes_ty);

quote! {
unsafe {
let a_bytes = self.#to_bytes(a).val.0;
let b_bytes = self.#to_bytes(b).val.0;
let a_blocks = [#( a_bytes.#blocks ),*];
let b_blocks = [#( b_bytes.#blocks2 ),*];

let shift_bytes = #byte_shift;
#bytes_arch_ty(#({
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a_blocks, &b_blocks, #blocks3, shift_bytes);
dyn_vext_128(lo, hi, shift_bytes % 16)
}),*)
}
}
}
_ => unimplemented!(),
};

quote! {
#method_sig {
if SHIFT >= #num_items {
return b;
}

let result = #bytes_expr;
self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
}
}
}
OpSig::Cvt {
target_ty,
scalar_bits,
Expand Down Expand Up @@ -478,3 +548,25 @@ impl Level for Neon {
}
}
}

/// Generates NEON slide helpers: `dyn_vext_128`, a wrapper around the `vextq_u8`
/// intrinsic that accepts the shift as a runtime value instead of a const generic.
fn mk_slide_helpers() -> TokenStream {
    // One match arm per possible byte shift (0..16), each dispatching to the
    // const-generic intrinsic with that shift baked in.
    let mut arms = Vec::with_capacity(16);
    for shift in 0_usize..16 {
        let shift_i32 = i32::try_from(shift).unwrap();
        arms.push(quote! { #shift => vextq_u8::<#shift_i32>(a, b) });
    }

    quote! {
        /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still
        /// expected to be constant in practice, so the match statement will be optimized out. This exists because
        /// Rust doesn't currently let you do math on const generics.
        #[inline(always)]
        unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
            unsafe {
                match shift {
                    #(#arms,)*
                    _ => unreachable!()
                }
            }
        }
    }
}
17 changes: 14 additions & 3 deletions fearless_simd_gen/src/mk_simd_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use proc_macro2::TokenStream;
use quote::quote;

use crate::{
ops::{OpKind, TyFlavor, ops_for_type, overloaded_ops_for, vec_trait_ops_for},
ops::{OpKind, TyFlavor, base_trait_ops, ops_for_type, overloaded_ops_for, vec_trait_ops_for},
types::{SIMD_TYPES, ScalarType, type_imports},
};

Expand Down Expand Up @@ -130,6 +130,17 @@ pub(crate) fn mk_arch_types() -> TokenStream {
}

fn mk_simd_base() -> TokenStream {
let mut methods = vec![];
for op in base_trait_ops() {
let doc = op.format_docstring(TyFlavor::VecImpl);
if let Some(method_sig) = op.vec_trait_method_sig() {
methods.push(quote! {
#[doc = #doc]
#method_sig;
});
}
}

quote! {
/// Base functionality implemented by all SIMD vectors.
pub trait SimdBase<S: Simd>:
Expand Down Expand Up @@ -167,15 +178,15 @@ fn mk_simd_base() -> TokenStream {
///
/// The slice must be the proper width.
fn from_slice(simd: S, slice: &[Self::Element]) -> Self;
/// Create a SIMD vector with all elements set to the given value.
fn splat(simd: S, val: Self::Element) -> Self;
/// Create a SIMD vector from a 128-bit vector of the same scalar
/// type, repeated.
fn block_splat(block: Self::Block) -> Self;
/// Create a SIMD vector where each element is produced by
/// calling `f` with that element's lane index (from 0 to
/// [`SimdBase::N`] - 1).
fn from_fn(simd: S, f: impl FnMut(usize) -> Self::Element) -> Self;

#( #methods )*
}
}
}
Expand Down
11 changes: 11 additions & 0 deletions fearless_simd_gen/src/mk_simd_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,8 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream {
let from_array_op = generic_op_name("load_array", ty);
let as_array_ref_op = generic_op_name("as_array_ref", ty);
let as_array_mut_op = generic_op_name("as_array_mut", ty);
let slide_op = generic_op_name("slide", ty);
let slide_blockwise_op = generic_op_name("slide_within_blocks", ty);
quote! {
impl<S: Simd> SimdBase<S> for #name<S> {
type Element = #scalar;
Expand Down Expand Up @@ -334,6 +336,15 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream {
simd.#from_array_op(core::array::from_fn(f))
}

#[inline(always)]
fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
self.simd.#slide_op::<SHIFT>(self, rhs.simd_into(self.simd))
}

#[inline(always)]
fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
self.simd.#slide_blockwise_op::<SHIFT>(self, rhs.simd_into(self.simd))
}
}
impl<S: Simd> crate::#vec_trait_id<S> for #name<S> {
#( #methods )*
Expand Down
Loading