Add a "slide" operation (like x86's alignr and ARM's vext)
#164
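For orientation, here is a hedged scalar sketch of the slide semantics being added (the names `slide_ref`, `SHIFT`, and `LEN` are illustrative and not part of this PR): the result is the concatenation of `a` followed by `b`, viewed through a LEN-lane window shifted by SHIFT lanes, clamping to `b` once SHIFT reaches the lane count, which mirrors the `SHIFT >= num_items` guard in the generated code below.

fn slide_ref<const SHIFT: usize, const LEN: usize>(a: [u8; LEN], b: [u8; LEN]) -> [u8; LEN] {
    if SHIFT >= LEN {
        // Mirrors the guard emitted by the codegen: everything comes from `b`.
        return b;
    }
    let mut out = [0u8; LEN];
    for i in 0..LEN {
        let j = i + SHIFT;
        // Take from `a` while the window is still over it, then spill into `b`.
        out[i] = if j < LEN { a[j] } else { b[j - LEN] };
    }
    out
}

// Example: slide_ref::<1, 4>([1, 2, 3, 4], [5, 6, 7, 8]) == [2, 3, 4, 5],
// the same lane movement as ARM's vext with a one-lane shift.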
@@ -9,7 +9,7 @@ use crate::generic::{
     generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_to_bytes,
 };
 use crate::level::Level;
-use crate::ops::{Op, valid_reinterpret};
+use crate::ops::{Op, SlideGranularity, valid_reinterpret};
 use crate::{
     arch::neon::{self, cvt_intrinsic, simple_intrinsic, split_intrinsic},
     ops::OpSig,

@@ -66,6 +66,10 @@ impl Level for Neon {
         }
     }

+    fn make_module_footer(&self) -> TokenStream {
+        mk_slide_helpers()
+    }
+
     fn make_impl_body(&self) -> TokenStream {
         quote! {
             #[inline]
@@ -395,6 +399,72 @@ impl Level for Neon {
                     }
                 }
             }
+            OpSig::Slide { granularity } => {
+                use SlideGranularity::*;
+
+                let block_wrapper = vec_ty.aligned_wrapper();
+                let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8);
+                let combined_bytes = bytes_ty.rust();
+                let scalar_bytes = vec_ty.scalar_bits / 8;
+                let num_items = vec_ty.len;
+                let to_bytes = generic_op_name("cvt_to_bytes", vec_ty);
+                let from_bytes = generic_op_name("cvt_from_bytes", vec_ty);
+
+                let byte_shift = if scalar_bytes == 1 {
+                    quote! { SHIFT }
+                } else {
+                    quote! { SHIFT * #scalar_bytes }
+                };
+
+                let bytes_expr = match (granularity, vec_ty.n_bits()) {
+                    (WithinBlocks, 128) => {
+                        panic!("This should have been handled by generic_op");
+                    }
+                    (WithinBlocks, _) | (_, 128) => {
+                        quote! {
+                            unsafe {
+                                dyn_vext_128(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift)
+                            }
+                        }
+                    }
+                    (AcrossBlocks, 256 | 512) => {
+                        let num_blocks = vec_ty.n_bits() / 128;
+
+                        // Ranges are not `Copy`, so we need to create a new range iterator for each usage
+                        let blocks = (0..num_blocks).map(Literal::usize_unsuffixed);
+                        let blocks2 = blocks.clone();
+                        let blocks3 = blocks.clone();
+                        let bytes_arch_ty = self.arch_ty(&bytes_ty);
+
+                        quote! {
+                            unsafe {
+                                let a_bytes = self.#to_bytes(a).val.0;
+                                let b_bytes = self.#to_bytes(b).val.0;
+                                let a_blocks = [#( a_bytes.#blocks ),*];
+                                let b_blocks = [#( b_bytes.#blocks2 ),*];
+
+                                let shift_bytes = #byte_shift;
+                                #bytes_arch_ty(#({
+                                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a_blocks, &b_blocks, #blocks3, shift_bytes);
+                                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                                }),*)
+                            }
+                        }
+                    }
+                    _ => unimplemented!(),
+                };
+
+                quote! {
+                    #method_sig {
+                        if SHIFT >= #num_items {
+                            return b;
+                        }
+
+                        let result = #bytes_expr;
+                        self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
+                    }
+                }
+            }
             OpSig::Cvt {
                 target_ty,
                 scalar_bits,
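The `crate::support::cross_block_slide_blocks_at` helper used in the `AcrossBlocks` arm above is defined in one of the diffs not shown on this page, so the following is only a hedged, illustrative sketch of what such a block-picking helper could do, not the PR's actual implementation: for output block `i` of a 256- or 512-bit slide by `shift_bytes`, return the two adjacent 128-bit blocks of the concatenation of `a` and `b` that the output block straddles, so that `dyn_vext_128(lo, hi, shift_bytes % 16)` can stitch them together.

// Illustrative only; the real helper's name, signature, and behavior may differ.
fn cross_block_slide_blocks_at_sketch<T: Copy>(
    a_blocks: &[T],
    b_blocks: &[T],
    i: usize,
    shift_bytes: usize,
) -> [T; 2] {
    // View `a_blocks` followed by `b_blocks` as one sequence of 16-byte blocks.
    let pick = |idx: usize| {
        if idx < a_blocks.len() {
            a_blocks[idx]
        } else {
            b_blocks[idx - a_blocks.len()]
        }
    };
    // Output block `i` covers bytes [16 * i + shift_bytes, 16 * i + shift_bytes + 16)
    // of that sequence, which straddles block `lo` and the block after it.
    let lo = i + shift_bytes / 16;
    [pick(lo), pick(lo + 1)]
}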
@@ -478,3 +548,25 @@ impl Level for Neon {
         }
     }
 }
+
+fn mk_slide_helpers() -> TokenStream {
+    let shifts = (0_usize..16).map(|shift| {
+        let shift_i32 = i32::try_from(shift).unwrap();
+        quote! { #shift => vextq_u8::<#shift_i32>(a, b) }
+    });
+
+    quote! {
+        /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still
+        /// expected to be constant in practice, so the match statement will be optimized out. This exists because
+        /// Rust doesn't currently let you do math on const generics.
+        #[inline(always)]
+        unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
Collaborator: If we already have an unsafe block inside we don't need it on the function itself, no?
+            unsafe {
+                match shift {
+                    #(#shifts,)*
+                    _ => unreachable!()
+                }
+            }
+        }
+    }
+}
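The doc comment above names the motivating limitation: on stable Rust, arithmetic on a const generic cannot be used as a const argument. A hypothetical example (not from the PR) of the direct formulation that does not compile today:

// unsafe fn slide_u32x4<const SHIFT: usize>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
//     vextq_u8::<{ SHIFT * 4 }>(a, b)
//     //          ^^^^^^^^^^^^^ error: generic parameters may not be used in const operations
//     //          (allowed only with the unstable `generic_const_exprs` feature)
// }
//
// `dyn_vext_128` sidesteps this by matching on a runtime `usize`; since the shift is a
// compile-time constant at every call site, the match is expected to constant-fold away.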
In this case we probably wouldn't need to call the dyn methods either but can just call the intrinsic, right?
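A hedged sketch of what the first suggestion could look like (illustrative only; whether the function should keep an `unsafe` signature is a safety-contract decision for the PR): drop `unsafe` from the signature and keep the `unsafe` block around the intrinsic calls.

// Illustrative sketch, not the PR's code.
#[inline(always)]
fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
    // Callers still need the relevant target features enabled; only the
    // intrinsic calls themselves are wrapped in `unsafe` here.
    unsafe {
        match shift {
            0 => vextq_u8::<0>(a, b),
            // ... arms 1..=14 generated exactly as in `mk_slide_helpers` ...
            15 => vextq_u8::<15>(a, b),
            _ => unreachable!(),
        }
    }
}

As for calling the intrinsic directly: that would work wherever the shift is already a literal or a bare const parameter of the right type, but the generated call sites pass `SHIFT * scalar_bytes`, which is exactly the const arithmetic the helper exists to avoid, so whether it applies depends on the case the reviewer has in mind.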