diff --git a/src/neon/jzazbz_to_image.rs b/src/neon/jzazbz_to_image.rs index 1e40b0d..fc26c4b 100644 --- a/src/neon/jzazbz_to_image.rs +++ b/src/neon/jzazbz_to_image.rs @@ -288,5 +288,61 @@ pub unsafe fn neon_jzazbz_to_image( + src_ptr_0, + transfer_function, + target, + luminance_scale, + ); + + let zeros = vdup_n_u16(0); + + let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), zeros); + let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), zeros); + let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), zeros); + + let r_row = vqmovn_u16(r_row01); + let g_row = vqmovn_u16(g_row01); + let b_row = vqmovn_u16(b_row01); + + let dst_ptr = dst.add(dst_offset as usize + cx * channels); + + if image_configuration.has_alpha() { + let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), zeros); + let a_row = vqmovn_u16(a_row01); + let store_rows = match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Rgba => { + uint8x8x4_t(r_row, g_row, b_row, a_row) + } + ImageConfiguration::Bgra | ImageConfiguration::Bgr => { + uint8x8x4_t(b_row, g_row, r_row, a_row) + } + }; + let mut transient: [u8; 32] = [0; 32]; + vst4_u8(transient.as_mut_ptr(), store_rows); + std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + } else { + let store_rows = match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Rgba => { + uint8x8x3_t(r_row, g_row, b_row) + } + ImageConfiguration::Bgra | ImageConfiguration::Bgr => { + uint8x8x3_t(b_row, g_row, r_row) + } + }; + let mut transient: [u8; 24] = [0; 24]; + vst3_u8(transient.as_mut_ptr(), store_rows); + std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); + } + + cx += 4; + } + cx } diff --git a/src/neon/to_linear_u8.rs b/src/neon/to_linear_u8.rs index 84e2d73..ca2a12a 100644 --- a/src/neon/to_linear_u8.rs +++ b/src/neon/to_linear_u8.rs @@ -7,7 +7,10 @@ use crate::image::ImageConfiguration; use crate::neon::{get_neon_gamma_transfer, get_neon_linear_transfer}; -use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half, TransferFunction}; +use crate::{ + load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, + TransferFunction, +}; use std::arch::aarch64::*; #[inline(always)] @@ -156,16 +159,60 @@ pub unsafe fn neon_channels_to_linear_u8< let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), vmovn_u32(z_low_high))); + let dst = dst_ptr.add(cx * channels); + if USE_ALPHA { let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan)); - vst4_u8(dst_ptr.add(cx * channels), v_4); + vst4_u8(dst, v_4); } else { let v_4 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm); - vst3_u8(dst_ptr.add(cx * channels), v_4); + vst3_u8(dst, v_4); } cx += 8; } + while cx + 4 < width as usize { + let src_ptr = src.add(src_offset + cx * channels); + + let (r_chan, g_chan, b_chan, a_chan) = + load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); + + let r_low = vmovl_u8(vget_low_u8(r_chan)); + let g_low = vmovl_u8(vget_low_u8(g_chan)); + let b_low = vmovl_u8(vget_low_u8(b_chan)); + + let r_low_low = vmovl_u16(vget_low_u16(r_low)); + let g_low_low = vmovl_u16(vget_low_u16(g_low)); + let b_low_low = vmovl_u16(vget_low_u16(b_low)); + + let (x_low_low, y_low_low, z_low_low) = + neon_triple_to_linear_u8(r_low_low, g_low_low, b_low_low, &transfer); + + let zeros = vdup_n_u16(0); + + let r_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), zeros)); + + let g_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), zeros)); + + let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), zeros)); + + let dst = dst_ptr.add(cx * channels); + + if USE_ALPHA { + let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan)); + let mut transient: [u8; 32] = [0; 32]; + vst4_u8(transient.as_mut_ptr(), v_4); + std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 4); + } else { + let v_3 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm); + let mut transient: [u8; 24] = [0; 24]; + vst3_u8(transient.as_mut_ptr(), v_3); + std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 3); + } + + cx += 4; + } + cx } diff --git a/src/neon/to_sigmoidal.rs b/src/neon/to_sigmoidal.rs index 96a11fe..2d918c9 100644 --- a/src/neon/to_sigmoidal.rs +++ b/src/neon/to_sigmoidal.rs @@ -7,7 +7,7 @@ use crate::image::ImageConfiguration; use crate::neon::sigmoidal::neon_rgb_to_sigmoidal; -use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half}; +use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter}; use std::arch::aarch64::*; #[inline(always)] @@ -244,5 +244,49 @@ pub unsafe fn neon_image_to_sigmoidal { + float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low) + } + ImageConfiguration::Bgra | ImageConfiguration::Bgr => { + float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low) + } + }; + vst4q_f32(dst_ptr.add(cx * channels), store_rows); + } else { + let store_rows = match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Rgba => { + float32x4x3_t(x_low_low, y_low_low, z_low_low) + } + ImageConfiguration::Bgra | ImageConfiguration::Bgr => { + float32x4x3_t(z_low_low, y_low_low, x_low_low) + } + }; + vst3q_f32(dst_ptr.add(cx * channels), store_rows); + } + + cx += 4; + } + cx }