diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 72f698e..503b315 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -58,7 +58,7 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0f32); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_sigmoidal( + rgb_to_lch( src_bytes, src_stride, &mut lab_store, @@ -92,7 +92,7 @@ fn main() { // } let start_time = Instant::now(); - sigmoidal_to_rgb( + lch_to_rgb( &lab_store, store_stride as u32, &mut dst_slice, diff --git a/src/image_to_sigmoidal.rs b/src/image_to_sigmoidal.rs index 065a76a..303196b 100644 --- a/src/image_to_sigmoidal.rs +++ b/src/image_to_sigmoidal.rs @@ -113,13 +113,16 @@ fn image_to_sigmoidal( let px = x * channels; let src = unsafe { src_ptr.add(px) }; let r = unsafe { - src.add(image_configuration.get_r_channel_offset()).read_unaligned() + src.add(image_configuration.get_r_channel_offset()) + .read_unaligned() }; let g = unsafe { - src.add(image_configuration.get_g_channel_offset()).read_unaligned() + src.add(image_configuration.get_g_channel_offset()) + .read_unaligned() }; let b = unsafe { - src.add(image_configuration.get_b_channel_offset()).read_unaligned() + src.add(image_configuration.get_b_channel_offset()) + .read_unaligned() }; let rgb = Rgb::::new(r, g, b); @@ -135,7 +138,8 @@ fn image_to_sigmoidal( if image_configuration.has_alpha() { let a = unsafe { - src.add(image_configuration.get_a_channel_offset()).read_unaligned() + src.add(image_configuration.get_a_channel_offset()) + .read_unaligned() } as f32 * COLOR_SCALE; diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index 95143d4..f73cf32 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -5,7 +5,6 @@ use crate::avx::avx2_image_to_xyz_lab; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" @@ -16,27 +15,9 @@ use crate::neon::neon_channels_to_xyz_or_lab; target_feature = "sse4.1" ))] use crate::sse::sse_channels_to_xyz_or_lab; +use crate::xyz_target::XyzTarget; use crate::{Rgb, Xyz, SRGB_TO_XYZ_D65}; -pub(crate) enum XyzTarget { - LAB = 0, - XYZ = 1, - LUV = 2, -} - -impl From for XyzTarget { - fn from(value: u8) -> Self { - match value { - 0 => LAB, - 1 => XYZ, - 2 => LUV, - _ => { - panic!("Not implemented") - } - } - } -} - #[inline(always)] fn channels_to_xyz( src: &[u8], @@ -95,74 +76,76 @@ fn channels_to_xyz( - cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - a_channel.as_mut_ptr(), - a_offset, - &matrix, - transfer_function, - ); - } else { - cx = avx2_image_to_xyz_lab::( - cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - std::ptr::null_mut(), - 0usize, - &matrix, - transfer_function, - ); + if target != XyzTarget::LCH { + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2" + ))] + unsafe { + if _has_avx2 { + if USE_ALPHA { + cx = avx2_image_to_xyz_lab::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + a_channel.as_mut_ptr(), + a_offset, + &matrix, + transfer_function, + ); + } else { + cx = avx2_image_to_xyz_lab::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + std::ptr::null_mut(), + 0usize, + &matrix, + transfer_function, + ); + } } } - } - #[cfg(all( - any(target_arch = "x86_64", target_arch = "x86"), - 
target_feature = "sse4.1" - ))] - unsafe { - if _has_sse { - if USE_ALPHA { - cx = sse_channels_to_xyz_or_lab::( - cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - a_channel.as_mut_ptr(), - a_offset, - &matrix, - transfer_function, - ) - } else { - cx = sse_channels_to_xyz_or_lab::( - cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - std::ptr::null_mut(), - 0usize, - &matrix, - transfer_function, - ) + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + if USE_ALPHA { + cx = sse_channels_to_xyz_or_lab::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + a_channel.as_mut_ptr(), + a_offset, + &matrix, + transfer_function, + ) + } else { + cx = sse_channels_to_xyz_or_lab::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + std::ptr::null_mut(), + 0usize, + &matrix, + transfer_function, + ) + } } } } @@ -221,20 +204,19 @@ fn channels_to_xyz::new(r, g, b); + let ptr = unsafe { dst_ptr.add(x * 3) }; match target { - LAB => { + XyzTarget::LAB => { let lab = rgb.to_lab(); unsafe { - let ptr = dst_ptr.add(x * 3); ptr.write_unaligned(lab.l); ptr.add(1).write_unaligned(lab.a); ptr.add(2).write_unaligned(lab.b); } } - XYZ => { + XyzTarget::XYZ => { let xyz = Xyz::from_rgb(&rgb, &matrix, transfer_function); unsafe { - let ptr = dst_ptr.add(x * 3); ptr.write_unaligned(xyz.x); ptr.add(1).write_unaligned(xyz.y); ptr.add(2).write_unaligned(xyz.z); @@ -243,12 +225,19 @@ fn channels_to_xyz { let luv = rgb.to_luv(); unsafe { - let ptr = dst_ptr.add(x * 3); ptr.write_unaligned(luv.l); ptr.add(1).write_unaligned(luv.u); ptr.add(2).write_unaligned(luv.v); } } + XyzTarget::LCH => { + let lch = rgb.to_lch(); + unsafe { + ptr.write_unaligned(lch.l); + ptr.add(1).write_unaligned(lch.c); + ptr.add(2).write_unaligned(lch.h); + } + } } if USE_ALPHA && image_configuration.has_alpha() { @@ -293,7 +282,7 @@ pub fn rgb_to_xyz( transfer_function: TransferFunction, ) { let mut empty_vec = vec![]; - channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { XYZ as u8 }>( + channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::XYZ as u8 }>( src, src_stride, dst, @@ -327,7 +316,7 @@ pub fn srgb_to_xyz( height: u32, ) { let mut empty_vec = vec![]; - channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { XYZ as u8 }>( + channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::XYZ as u8 }>( src, src_stride, dst, @@ -359,7 +348,7 @@ pub fn rgb_to_lab( height: u32, ) { let mut empty_vec = vec![]; - channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { LAB as u8 }>( + channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::LAB as u8 }>( src, src_stride, dst, @@ -395,7 +384,7 @@ pub fn rgba_to_xyz( transfer_function: TransferFunction, ) { let mut empty_vec = vec![]; - channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, false, { XYZ as u8 }>( + channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, false, { XyzTarget::XYZ as u8 }>( src, src_stride, dst, @@ -429,7 +418,7 @@ pub fn srgba_to_xyz( height: u32, ) { let mut empty_vec = vec![]; - channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, false, { XYZ as u8 }>( + channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, false, { XyzTarget::XYZ as u8 }>( src, src_stride, dst, @@ -468,7 +457,7 @@ pub fn rgba_to_xyza( matrix: &[[f32; 3]; 3], transfer_function: TransferFunction, ) { - channels_to_xyz::<{ ImageConfiguration::Rgba 
as u8 }, true, { XYZ as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, true, { XyzTarget::XYZ as u8 }>(
         src,
         src_stride,
         dst,
@@ -503,7 +492,7 @@ pub fn srgba_to_xyza(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, true, { XYZ as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, true, { XyzTarget::XYZ as u8 }>(
         src,
         src_stride,
         dst,
@@ -538,7 +527,7 @@ pub fn rgba_to_lab(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, false, { LAB as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, false, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -573,7 +562,7 @@ pub fn rgba_to_laba(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, true, { LAB as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Rgba as u8 }, true, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -606,7 +595,7 @@ pub fn bgra_to_laba(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz::<{ ImageConfiguration::Bgra as u8 }, true, { LAB as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Bgra as u8 }, true, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -638,7 +627,7 @@ pub fn bgr_to_lab(
     height: u32,
 ) {
     let mut empty_vec = vec![];
-    channels_to_xyz::<{ ImageConfiguration::Bgr as u8 }, false, { LAB as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Bgr as u8 }, false, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -670,7 +659,7 @@ pub fn rgb_to_luv(
     height: u32,
 ) {
     let mut empty_vec = vec![];
-    channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { LUV as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::LUV as u8 }>(
         src,
         src_stride,
         dst,
@@ -702,7 +691,71 @@ pub fn bgr_to_luv(
     height: u32,
 ) {
     let mut empty_vec = vec![];
-    channels_to_xyz::<{ ImageConfiguration::Bgr as u8 }, false, { LUV as u8 }>(
+    channels_to_xyz::<{ ImageConfiguration::Bgr as u8 }, false, { XyzTarget::LUV as u8 }>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        &mut empty_vec,
+        0,
+        width,
+        height,
+        &SRGB_TO_XYZ_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts RGB to CIE L\*C\*h against D65 white point. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing RGB data
+/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `dst` - A mutable slice to receive LCH data
+/// * `dst_stride` - Bytes per row for dst data
+pub fn rgb_to_lch(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+) {
+    let mut empty_vec = vec![];
+    channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::LCH as u8 }>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        &mut empty_vec,
+        0,
+        width,
+        height,
+        &SRGB_TO_XYZ_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts BGR to CIE L\*C\*h against D65 white point. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing BGR data
+/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `dst` - A mutable slice to receive LCH data
+/// * `dst_stride` - Bytes per row for dst data
+pub fn bgr_to_lch(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+) {
+    let mut empty_vec = vec![];
+    channels_to_xyz::<{ ImageConfiguration::Bgr as u8 }, false, { XyzTarget::LCH as u8 }>(
         src,
         src_stride,
         dst,
diff --git a/src/image_xyza_laba.rs b/src/image_xyza_laba.rs
index ee89e9f..3e634a4 100644
--- a/src/image_xyza_laba.rs
+++ b/src/image_xyza_laba.rs
@@ -1,6 +1,4 @@
 use crate::image::ImageConfiguration;
-use crate::image_to_xyz_lab::XyzTarget;
-use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ};
 #[cfg(all(
     any(target_arch = "aarch64", target_arch = "arm"),
     target_feature = "neon"
 ))]
@@ -11,6 +9,7 @@ use crate::neon::neon_channels_to_xyza_or_laba;
     target_feature = "sse4.1"
 ))]
 use crate::sse::sse_channels_to_xyza_laba;
+use crate::xyz_target::XyzTarget;
 use crate::{Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65};
 
 #[inline(always)]
 fn channels_to_xyz_with_alpha<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
@@ -57,22 +56,24 @@ fn channels_to_xyz_with_alpha(
-    #[cfg(all(
-        any(target_arch = "x86_64", target_arch = "x86"),
-        target_feature = "sse4.1"
-    ))]
-    unsafe {
-        if _has_sse {
-            cx = sse_channels_to_xyza_laba::<CHANNELS_CONFIGURATION, TARGET>(
-                cx,
-                src.as_ptr(),
-                src_offset,
-                width,
-                dst.as_mut_ptr(),
-                dst_offset,
-                &matrix,
-                transfer_function,
-            );
+    if target != XyzTarget::LCH {
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
+        unsafe {
+            if _has_sse {
+                cx = sse_channels_to_xyza_laba::<CHANNELS_CONFIGURATION, TARGET>(
+                    cx,
+                    src.as_ptr(),
+                    src_offset,
+                    width,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    &matrix,
+                    transfer_function,
+                );
+            }
         }
     }
 
@@ -116,7 +117,7 @@ fn channels_to_xyz_with_alpha(
         match target {
-            LAB => {
+            XyzTarget::LAB => {
                 let lab = rgb.to_lab();
                 unsafe {
                     dst_store.write_unaligned(lab.l);
@@ -124,7 +125,7 @@ fn channels_to_xyz_with_alpha(
-            XYZ => {
+            XyzTarget::XYZ => {
                 let xyz = Xyz::from_rgb(&rgb, &matrix, transfer_function);
                 unsafe {
                     dst_store.write_unaligned(xyz.x);
@@ -140,6 +141,14 @@ fn channels_to_xyz_with_alpha(
+            XyzTarget::LCH => {
+                let lch = rgb.to_lch();
+                unsafe {
+                    dst_store.write_unaligned(lch.l);
+                    dst_store.add(1).write_unaligned(lch.c);
+                    dst_store.add(2).write_unaligned(lch.h);
+                }
+            }
         }
 
         let a = unsafe {
@@ -176,7 +185,7 @@ pub fn rgba_to_lab_with_alpha(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { LAB as u8 }>(
+    channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -205,7 +214,7 @@ pub fn bgra_to_lab_with_alpha(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { LAB as u8 }>(
+    channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -234,7 +243,7 @@ pub fn rgba_to_luv_with_alpha(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { LUV as u8 }>(
+    channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::LUV as u8 }>(
         src,
         src_stride,
         dst,
@@ -263,7 +272,7 @@ pub fn bgra_to_luv_with_alpha(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { LUV as u8 }>(
+    channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::LUV as u8 }>(
         src,
         src_stride,
         dst,
@@ -292,7 +301,7 @@ pub fn rgba_to_xyz_with_alpha(
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XYZ as u8 }>(
+    channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::XYZ as u8 }>(
         src,
         src_stride,
         dst,
@@ -313,8 +322,6 @@ pub fn rgba_to_xyz_with_alpha(
 /// * `height` - Image height
 /// * `dst` - A mutable slice to receive XYZ data
 /// * `dst_stride` - Bytes per row for dst data
-/// * `a_plane` - A mutable slice to receive XYZ data
-/// * `a_stride` - Bytes per row for dst data
 pub fn bgra_to_xyz_with_alpha(
     src: &[u8],
     src_stride: u32,
     dst: &mut [f32],
     dst_stride: u32,
     width: u32,
     height: u32,
 ) {
-    channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XYZ as u8 }>(
+    channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::XYZ as u8 }>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        &SRGB_TO_XYZ_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts RGBA to CIE LCH against D65 white point, preserving and normalizing the alpha channel and keeping it in the last position. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing RGBA data
+/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `dst` - A mutable slice to receive LCH(a) data
+/// * `dst_stride` - Bytes per row for dst data
+pub fn rgba_to_lch_with_alpha(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+) {
+    channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::LCH as u8 }>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        &SRGB_TO_XYZ_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts BGRA to CIE LCH against D65 white point, preserving and normalizing the alpha channel and keeping it in the last position. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing BGRA data
+/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LCH data +/// * `dst_stride` - Bytes per row for dst data +pub fn bgra_to_lch_with_alpha( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::LCH as u8 }>( src, src_stride, dst, diff --git a/src/lib.rs b/src/lib.rs index 8920cf0..99ecd51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,6 +37,7 @@ mod sigmoidal_to_image; mod sse; mod xyz; mod xyz_lab_to_image; +mod xyz_target; mod xyz_transform; mod xyza_laba_to_image; @@ -49,9 +50,11 @@ pub use image_to_hsv::*; pub use image_to_linear::*; pub use image_to_linear_u8::*; pub use image_to_xyz_lab::bgr_to_lab; +pub use image_to_xyz_lab::bgr_to_lch; pub use image_to_xyz_lab::bgr_to_luv; pub use image_to_xyz_lab::bgra_to_laba; pub use image_to_xyz_lab::rgb_to_lab; +pub use image_to_xyz_lab::rgb_to_lch; pub use image_to_xyz_lab::rgb_to_luv; pub use image_to_xyz_lab::rgb_to_xyz; pub use image_to_xyz_lab::rgba_to_lab; @@ -62,9 +65,11 @@ pub use image_to_xyz_lab::srgb_to_xyz; pub use image_to_xyz_lab::srgba_to_xyz; pub use image_to_xyz_lab::srgba_to_xyza; pub use image_xyza_laba::bgra_to_lab_with_alpha; +pub use image_xyza_laba::bgra_to_lch_with_alpha; pub use image_xyza_laba::bgra_to_luv_with_alpha; pub use image_xyza_laba::bgra_to_xyz_with_alpha; pub use image_xyza_laba::rgba_to_lab_with_alpha; +pub use image_xyza_laba::rgba_to_lch_with_alpha; pub use image_xyza_laba::rgba_to_luv_with_alpha; pub use image_xyza_laba::rgba_to_xyz_with_alpha; pub use lab::Lab; @@ -84,6 +89,8 @@ pub use rgba::ToRgbaF32; pub use xyz::Xyz; pub use xyz_lab_to_image::lab_to_srgb; pub use xyz_lab_to_image::laba_to_srgb; +pub use xyz_lab_to_image::lch_to_bgr; +pub use xyz_lab_to_image::lch_to_rgb; pub use xyz_lab_to_image::luv_to_bgr; pub use xyz_lab_to_image::luv_to_rgb; pub use xyz_lab_to_image::xyz_to_rgb; @@ -92,6 +99,8 @@ pub use xyz_lab_to_image::xyza_to_rgba; pub use xyz_transform::*; pub use xyza_laba_to_image::lab_with_alpha_to_bgra; pub use xyza_laba_to_image::lab_with_alpha_to_rgba; +pub use xyza_laba_to_image::lch_with_alpha_to_bgra; +pub use xyza_laba_to_image::lch_with_alpha_to_rgba; pub use xyza_laba_to_image::luv_with_alpha_to_bgra; pub use xyza_laba_to_image::luv_with_alpha_to_rgba; pub use xyza_laba_to_image::xyz_with_alpha_to_bgra; diff --git a/src/luv.rs b/src/luv.rs index dfd527b..62bb2a9 100644 --- a/src/luv.rs +++ b/src/luv.rs @@ -89,7 +89,6 @@ impl Luv { Luv::from_rgb(&rgba.to_rgb()) } - #[allow(dead_code)] pub fn to_rgb(&self) -> Rgb { if self.l <= 0f32 { return Xyz::new(0f32, 0f32, 0f32).to_srgb(); diff --git a/src/neon/cie.rs b/src/neon/cie.rs new file mode 100644 index 0000000..94cd620 --- /dev/null +++ b/src/neon/cie.rs @@ -0,0 +1,183 @@ +use crate::luv::{ + LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y, LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, + LUV_WHITE_V_PRIME, +}; +use crate::neon::math::{ + prefer_vfmaq_f32, vatan2q_f32, vcbrtq_f32, vcolorq_matrix_f32, vcosq_f32, vcubeq_f32, + vhypotq_f32, vsinq_f32, +}; +use std::arch::aarch64::*; + +#[inline(always)] +pub(crate) unsafe fn neon_triple_to_xyz( + r: uint32x4_t, + g: uint32x4_t, + b: uint32x4_t, + c1: float32x4_t, + c2: float32x4_t, + c3: float32x4_t, + c4: float32x4_t, + c5: float32x4_t, + c6: float32x4_t, + c7: float32x4_t, + c8: float32x4_t, + c9: float32x4_t, + transfer: &unsafe fn(float32x4_t) -> float32x4_t, +) -> (float32x4_t, float32x4_t, 
float32x4_t) { + let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); + let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); + let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 1f32 / 255f32); + let r_linear = transfer(r_f); + let g_linear = transfer(g_f); + let b_linear = transfer(b_f); + + let (x, y, z) = vcolorq_matrix_f32( + r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, + ); + (x, y, z) +} + +#[inline(always)] +pub(crate) unsafe fn neon_triple_to_luv( + x: float32x4_t, + y: float32x4_t, + z: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let zeros = vdupq_n_f32(0f32); + let den = prefer_vfmaq_f32( + prefer_vfmaq_f32(x, z, vdupq_n_f32(3f32)), + y, + vdupq_n_f32(15f32), + ); + let nan_mask = vceqzq_f32(den); + let l_low_mask = vcltq_f32(y, vdupq_n_f32(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = vcbrtq_f32(y); + let l = vbslq_f32( + l_low_mask, + vmulq_n_f32(y, LUV_MULTIPLIER_FORWARD_Y), + prefer_vfmaq_f32(vdupq_n_f32(-16f32), y_cbrt, vdupq_n_f32(116f32)), + ); + let u_prime = vdivq_f32(vmulq_n_f32(x, 4f32), den); + let v_prime = vdivq_f32(vmulq_n_f32(y, 9f32), den); + let sub_u_prime = vsubq_f32(u_prime, vdupq_n_f32(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = vsubq_f32(v_prime, vdupq_n_f32(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = vmulq_n_f32(l, 13f32); + let u = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_u_prime)); + let v = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_v_prime)); + (l, u, v) +} + +#[inline(always)] +pub(crate) unsafe fn neon_triple_to_lab( + x: float32x4_t, + y: float32x4_t, + z: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let x = vmulq_n_f32(x, 100f32 / 95.047f32); + let z = vmulq_n_f32(z, 100f32 / 108.883f32); + let cbrt_x = vcbrtq_f32(x); + let cbrt_y = vcbrtq_f32(y); + let cbrt_z = vcbrtq_f32(z); + let s_1 = vdupq_n_f32(16f32 / 116f32); + let s_2 = vdupq_n_f32(7.787f32); + let lower_x = prefer_vfmaq_f32(s_1, s_2, x); + let lower_y = prefer_vfmaq_f32(s_1, s_2, y); + let lower_z = prefer_vfmaq_f32(s_1, s_2, z); + let kappa = vdupq_n_f32(0.008856f32); + let x = vbslq_f32(vcgtq_f32(x, kappa), cbrt_x, lower_x); + let y = vbslq_f32(vcgtq_f32(y, kappa), cbrt_y, lower_y); + let z = vbslq_f32(vcgtq_f32(z, kappa), cbrt_z, lower_z); + let l = prefer_vfmaq_f32(vdupq_n_f32(-16.0f32), y, vdupq_n_f32(116.0f32)); + let a = vmulq_n_f32(vsubq_f32(x, y), 500f32); + let b = vmulq_n_f32(vsubq_f32(y, z), 200f32); + (l, a, b) +} + +#[inline(always)] +pub(crate) unsafe fn neon_triple_to_lch( + x: float32x4_t, + y: float32x4_t, + z: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let (luv_l, luv_u, luv_v) = neon_triple_to_luv(x, y, z); + let lch_c = vhypotq_f32(luv_u, luv_v); + let lch_h = vatan2q_f32(luv_v, luv_u); + (luv_l, lch_c, lch_h) +} + +#[inline(always)] +pub(crate) unsafe fn neon_luv_to_xyz( + l: float32x4_t, + u: float32x4_t, + v: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let zero_mask = vclezq_f32(l); + let zeros = vdupq_n_f32(0f32); + let l13 = vrecpeq_f32(vmulq_n_f32(l, 13f32)); + let u = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_U_PRIME), l13, u); + let v = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_V_PRIME), l13, v); + let l_h = vmulq_n_f32(vaddq_f32(l, vdupq_n_f32(16f32)), 1f32 / 116f32); + let y_high = vmulq_f32(vmulq_f32(l_h, l_h), l_h); + let y_low = vmulq_n_f32(l, LUV_MULTIPLIER_INVERSE_Y); + let y = vbslq_f32( + zero_mask, + zeros, + vbslq_f32(vcgtq_f32(l, vdupq_n_f32(8f32)), y_high, y_low), + ); + let zero_mask_2 = vclezq_f32(v); + let den = vrecpeq_f32(vmulq_n_f32(v, 4f32)); + 
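+    // den ~= 1/(4v'), via vrecpeq_f32's fast low-precision reciprocal estimate;
+    // the inverse Luv relations below are x = 9yu'/(4v') and z = y(12 - 3u' - 20v')/(4v').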
let mut x = vmulq_n_f32(vmulq_f32(vmulq_f32(y, u), den), 9f32); + x = vbslq_f32(zero_mask, zeros, x); + x = vbslq_f32(zero_mask_2, zeros, x); + let mut z = vmulq_f32( + vmulq_f32( + prefer_vfmaq_f32( + prefer_vfmaq_f32(vdupq_n_f32(12f32), vdupq_n_f32(-3f32), u), + v, + vdupq_n_f32(-20f32), + ), + y, + ), + den, + ); + z = vbslq_f32(zero_mask, zeros, z); + z = vbslq_f32(zero_mask_2, zeros, z); + (x, y, z) +} + +#[inline(always)] +pub(crate) unsafe fn neon_lab_to_xyz( + l: float32x4_t, + a: float32x4_t, + b: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let y = vmulq_n_f32(vaddq_f32(l, vdupq_n_f32(16f32)), 1f32 / 116f32); + let x = vaddq_f32(vmulq_n_f32(a, 1f32 / 500f32), y); + let z = vsubq_f32(y, vmulq_n_f32(b, 1f32 / 200f32)); + let x3 = vcubeq_f32(x); + let y3 = vcubeq_f32(y); + let z3 = vcubeq_f32(z); + let kappa = vdupq_n_f32(0.008856f32); + let k_sub = vdupq_n_f32(16f32 / 116f32); + let low_x = vmulq_n_f32(vsubq_f32(x, k_sub), 1f32 / 7.787f32); + let low_y = vmulq_n_f32(vsubq_f32(y, k_sub), 1f32 / 7.787f32); + let low_z = vmulq_n_f32(vsubq_f32(z, k_sub), 1f32 / 7.787f32); + + let x = vbslq_f32(vcgtq_f32(x3, kappa), x3, low_x); + let y = vbslq_f32(vcgtq_f32(y3, kappa), y3, low_y); + let z = vbslq_f32(vcgtq_f32(z3, kappa), z3, low_z); + let x = vmulq_n_f32(x, 95.047f32 / 100f32); + let z = vmulq_n_f32(z, 108.883f32 / 100f32); + (x, y, z) +} + +#[inline(always)] +pub(crate) unsafe fn neon_lch_to_xyz( + l: float32x4_t, + c: float32x4_t, + h: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let u = vmulq_f32(c, vcosq_f32(h)); + let v = vmulq_f32(c, vsinq_f32(h)); + neon_luv_to_xyz(l, u, v) +} diff --git a/src/neon/hsv_to_image.rs b/src/neon/hsv_to_image.rs index be1986c..1a38af4 100644 --- a/src/neon/hsv_to_image.rs +++ b/src/neon/hsv_to_image.rs @@ -4,10 +4,6 @@ use crate::image::ImageConfiguration; use crate::image_to_hsv_support::HsvTarget; use crate::neon::{neon_hsl_to_rgb, neon_hsv_to_rgb}; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline] pub unsafe fn neon_hsv_u16_to_image< const CHANNELS_CONFIGURATION: u8, diff --git a/src/neon/image_to_hsv.rs b/src/neon/image_to_hsv.rs index 8edd232..958eb9a 100644 --- a/src/neon/image_to_hsv.rs +++ b/src/neon/image_to_hsv.rs @@ -3,10 +3,6 @@ use crate::image_to_hsv_support::HsvTarget; use crate::neon::{neon_rgb_to_hsl, neon_rgb_to_hsv}; use std::arch::aarch64::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline(always)] #[allow(dead_code)] pub unsafe fn neon_channels_to_hsv< diff --git a/src/neon/math.rs b/src/neon/math.rs index abd73a1..7c69f48 100644 --- a/src/neon/math.rs +++ b/src/neon/math.rs @@ -1,4 +1,3 @@ -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use std::arch::aarch64::*; #[inline(always)] @@ -476,3 +475,172 @@ pub unsafe fn vcolorq_matrix_f32( let new_b = prefer_vfmaq_f32(prefer_vfmaq_f32(vmulq_f32(g, c8), b, c9), r, c7); (new_r, new_g, new_b) } + +#[inline(always)] +pub unsafe fn vhypotq_f32(x: float32x4_t, y: float32x4_t) -> float32x4_t { + let xp2 = vmulq_f32(x, x); + let yp2 = vmulq_f32(y, y); + let z = vaddq_f32(xp2, yp2); + return vsqrtq_f32(z); +} + +#[inline(always)] +pub unsafe fn vpoly4q_f32( + x: float32x4_t, + x2: float32x4_t, + c3: float32x4_t, + c2: float32x4_t, + c1: float32x4_t, + c0: float32x4_t, +) -> float32x4_t { + vmlafq_f32(x2, vmlafq_f32(x, c3, c2), vmlafq_f32(x, c1, c0)) +} + +#[inline(always)] +pub unsafe fn vpoly8q_f32( + x: float32x4_t, + x2: 
float32x4_t,
+    x4: float32x4_t,
+    c7: float32x4_t,
+    c6: float32x4_t,
+    c5: float32x4_t,
+    c4: float32x4_t,
+    c3: float32x4_t,
+    c2: float32x4_t,
+    c1: float32x4_t,
+    c0: float32x4_t,
+) -> float32x4_t {
+    vmlafq_f32(
+        x4,
+        vpoly4q_f32(x, x2, c7, c6, c5, c4),
+        vpoly4q_f32(x, x2, c3, c2, c1, c0),
+    )
+}
+
+#[inline(always)]
+unsafe fn vatan2q_f32_impl(y: float32x4_t, x: float32x4_t) -> float32x4_t {
+    let q = vbslq_s32(vcltzq_f32(x), vdupq_n_s32(-2), vdupq_n_s32(0));
+    let x = vabsq_f32(x);
+    let is_y_more_than_x = vcgtq_f32(y, x);
+    let t = vbslq_f32(is_y_more_than_x, x, vdupq_n_f32(0f32));
+    let x = vbslq_f32(is_y_more_than_x, y, x);
+    let y = vbslq_f32(is_y_more_than_x, vnegq_f32(t), y);
+    let q = vbslq_s32(is_y_more_than_x, vaddq_s32(q, vdupq_n_s32(1)), q);
+    let s = vdivq_f32(y, x);
+    let t = vmulq_f32(s, s);
+    let t2 = vmulq_f32(t, t);
+    let t4 = vmulq_f32(t2, t2);
+    let poly = vpoly8q_f32(
+        t,
+        t2,
+        t4,
+        vdupq_n_f32(0.00282363896258175373077393f32),
+        vdupq_n_f32(-0.0159569028764963150024414f32),
+        vdupq_n_f32(0.0425049886107444763183594f32),
+        vdupq_n_f32(-0.0748900920152664184570312f32),
+        vdupq_n_f32(0.106347933411598205566406f32),
+        vdupq_n_f32(-0.142027363181114196777344f32),
+        vdupq_n_f32(0.199926957488059997558594f32),
+        vdupq_n_f32(-0.333331018686294555664062f32),
+    );
+    let t = prefer_vfmaq_f32(s, vmulq_f32(poly, t), s);
+    let t = prefer_vfmaq_f32(
+        t,
+        vcvtq_f32_s32(q),
+        vdupq_n_f32(std::f32::consts::FRAC_PI_2),
+    );
+    t
+}
+
+#[inline(always)]
+pub unsafe fn vatan2q_f32(y: float32x4_t, x: float32x4_t) -> float32x4_t {
+    let r = vatan2q_f32_impl(vabsq_f32(y), x);
+    let r = vmulsignq_f32(r, x);
+    vmulsignq_f32(r, y)
+}
+
+#[inline(always)]
+pub unsafe fn vsinq_f32(val: float32x4_t) -> float32x4_t {
+    let pi_v = vdupq_n_f32(std::f32::consts::PI);
+    let pio2_v = vdupq_n_f32(std::f32::consts::FRAC_PI_2);
+    let ipi_v = vdupq_n_f32(std::f32::consts::FRAC_1_PI);
+
+    //Find positive or negative
+    let c_v = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v)));
+    let sign_v = vcleq_f32(val, vdupq_n_f32(0f32));
+    let odd_v = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1));
+
+    let neg_v = veorq_u32(odd_v, sign_v);
+
+    //Modulus a - (n * int(a*(1/n)))
+    let mut ma = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v)));
+    let reb_v = vcgeq_f32(ma, pio2_v);
+
+    //Rebase a between 0 and pi/2
+    ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma);
+
+    //Taylor series
+    let ma2 = vmulq_f32(ma, ma);
+
+    //2nd elem: x^3 / 3!
+    let mut elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(0.166666666666f32));
+    let mut res = vsubq_f32(ma, elem);
+
+    //3rd elem: x^5 / 5!
+    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.05f32));
+    res = vaddq_f32(res, elem);
+
+    //4th elem: x^7 / 7!
+    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.023809523810f32));
+    res = vsubq_f32(res, elem);
+
+    //5th elem: x^9 / 9!
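+    //(each term reuses the previous one: elem *= ma2 / (8 * 9), and 1/72 = 0.013888888889)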
+ elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.013888888889f32)); + res = vaddq_f32(res, elem); + + //Change of sign + let neg_v = vshlq_n_u32::<31>(neg_v); + res = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v)); + return res; +} + +#[inline(always)] +pub unsafe fn vcosq_f32(d: float32x4_t) -> float32x4_t { + let mut q = vcvtq_s32_f32(vsubq_f32( + vmulq_f32(d, vdupq_n_f32(std::f32::consts::FRAC_1_PI)), + vdupq_n_f32(0.5f32), + )); + + q = vaddq_s32(vaddq_s32(q, q), vdupq_n_s32(1)); + + let mut u = vcvtq_f32_s32(q); + let mut d = vmlafq_f32(u, vdupq_n_f32(-0.78515625f32 * 2f32), d); + d = vmlafq_f32(u, vdupq_n_f32(-0.00024187564849853515625f32 * 2f32), d); + d = vmlafq_f32(u, vdupq_n_f32(-3.7747668102383613586e-08f32 * 2f32), d); + d = vmlafq_f32(u, vdupq_n_f32(-1.2816720341285448015e-12f32 * 2f32), d); + + let s = vmulq_f32(d, d); + + d = vreinterpretq_f32_u32(veorq_u32( + vandq_u32( + vceqq_s32(vandq_s32(q, vdupq_n_s32(2)), vdupq_n_s32(0)), + vreinterpretq_u32_f32(vdupq_n_f32(-0.0f32)), + ), + vreinterpretq_u32_f32(d), + )); + + u = vdupq_n_f32(2.6083159809786593541503e-06f32); + u = vmlafq_f32(u, s, vdupq_n_f32(-0.0001981069071916863322258f32)); + u = vmlafq_f32(u, s, vdupq_n_f32(0.00833307858556509017944336f32)); + u = vmlafq_f32(u, s, vdupq_n_f32(-0.166666597127914428710938f32)); + + u = vmlafq_f32(s, vmulq_f32(u, d), d); + + u = vreinterpretq_f32_u32(vorrq_u32(vispinfq_f32(d), vreinterpretq_u32_f32(u))); + return u; +} + +#[inline(always)] +pub(crate) unsafe fn vcubeq_f32(x: float32x4_t) -> float32x4_t { + vmulq_f32(vmulq_f32(x, x), x) +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 2fca0d5..78d1949 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -1,3 +1,4 @@ +mod cie; mod colors; mod from_sigmoidal; mod gamma_curves; diff --git a/src/neon/to_linear.rs b/src/neon/to_linear.rs index e3701c2..d7ebd22 100644 --- a/src/neon/to_linear.rs +++ b/src/neon/to_linear.rs @@ -1,4 +1,3 @@ -#[allow(unused_imports)] use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::neon::*; diff --git a/src/neon/to_linear_u8.rs b/src/neon/to_linear_u8.rs index e5e80fc..0202595 100644 --- a/src/neon/to_linear_u8.rs +++ b/src/neon/to_linear_u8.rs @@ -1,7 +1,3 @@ -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] pub mod neon_image_linear_to_u8 { use crate::image::ImageConfiguration; use std::arch::aarch64::*; diff --git a/src/neon/to_xyz_lab.rs b/src/neon/to_xyz_lab.rs index 0098868..5d2489a 100644 --- a/src/neon/to_xyz_lab.rs +++ b/src/neon/to_xyz_lab.rs @@ -1,121 +1,12 @@ -#[allow(unused_imports)] use crate::gamma_curves::TransferFunction; -#[allow(unused_imports)] use crate::image::ImageConfiguration; -#[allow(unused_imports)] -use crate::image_to_xyz_lab::XyzTarget; -#[allow(unused_imports)] -use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; -use crate::neon::math::*; -#[allow(unused_imports)] +use crate::neon::cie::{ + neon_triple_to_lab, neon_triple_to_lch, neon_triple_to_luv, neon_triple_to_xyz, +}; use crate::neon::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] +use crate::xyz_target::XyzTarget; use std::arch::aarch64::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[inline(always)] -pub(crate) unsafe fn neon_triple_to_xyz( - r: uint32x4_t, - g: uint32x4_t, - b: uint32x4_t, - c1: float32x4_t, - c2: float32x4_t, - c3: float32x4_t, - c4: float32x4_t, - c5: 
float32x4_t, - c6: float32x4_t, - c7: float32x4_t, - c8: float32x4_t, - c9: float32x4_t, - transfer: &unsafe fn(float32x4_t) -> float32x4_t, -) -> (float32x4_t, float32x4_t, float32x4_t) { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 1f32 / 255f32); - let r_linear = transfer(r_f); - let g_linear = transfer(g_f); - let b_linear = transfer(b_f); - - let (x, y, z) = vcolorq_matrix_f32( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, - ); - (x, y, z) -} - -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[inline(always)] -pub(crate) unsafe fn neon_triple_to_luv( - x: float32x4_t, - y: float32x4_t, - z: float32x4_t, -) -> (float32x4_t, float32x4_t, float32x4_t) { - let zeros = vdupq_n_f32(0f32); - let den = prefer_vfmaq_f32( - prefer_vfmaq_f32(x, z, vdupq_n_f32(3f32)), - y, - vdupq_n_f32(15f32), - ); - let nan_mask = vceqzq_f32(den); - let l_low_mask = vcltq_f32(y, vdupq_n_f32(LUV_CUTOFF_FORWARD_Y)); - let y_cbrt = vcbrtq_f32(y); - let l = vbslq_f32( - l_low_mask, - vmulq_n_f32(y, LUV_MULTIPLIER_FORWARD_Y), - prefer_vfmaq_f32(vdupq_n_f32(-16f32), y_cbrt, vdupq_n_f32(116f32)), - ); - let u_prime = vdivq_f32(vmulq_n_f32(x, 4f32), den); - let v_prime = vdivq_f32(vmulq_n_f32(y, 9f32), den); - let sub_u_prime = vsubq_f32(u_prime, vdupq_n_f32(crate::luv::LUV_WHITE_U_PRIME)); - let sub_v_prime = vsubq_f32(v_prime, vdupq_n_f32(crate::luv::LUV_WHITE_V_PRIME)); - let l13 = vmulq_n_f32(l, 13f32); - let u = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_u_prime)); - let v = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_v_prime)); - (l, u, v) -} - -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[inline(always)] -pub(crate) unsafe fn neon_triple_to_lab( - x: float32x4_t, - y: float32x4_t, - z: float32x4_t, -) -> (float32x4_t, float32x4_t, float32x4_t) { - let x = vmulq_n_f32(x, 100f32 / 95.047f32); - let z = vmulq_n_f32(z, 100f32 / 108.883f32); - let cbrt_x = vcbrtq_f32(x); - let cbrt_y = vcbrtq_f32(y); - let cbrt_z = vcbrtq_f32(z); - let s_1 = vdupq_n_f32(16f32 / 116f32); - let s_2 = vdupq_n_f32(7.787f32); - let lower_x = prefer_vfmaq_f32(s_1, s_2, x); - let lower_y = prefer_vfmaq_f32(s_1, s_2, y); - let lower_z = prefer_vfmaq_f32(s_1, s_2, z); - let kappa = vdupq_n_f32(0.008856f32); - let x = vbslq_f32(vcgtq_f32(x, kappa), cbrt_x, lower_x); - let y = vbslq_f32(vcgtq_f32(y, kappa), cbrt_y, lower_y); - let z = vbslq_f32(vcgtq_f32(z, kappa), cbrt_z, lower_z); - let l = prefer_vfmaq_f32(vdupq_n_f32(-16.0f32), y, vdupq_n_f32(116.0f32)); - let a = vmulq_n_f32(vsubq_f32(x, y), 500f32); - let b = vmulq_n_f32(vsubq_f32(y, z), 200f32); - (l, a, b) -} - -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline(always)] pub unsafe fn neon_channels_to_xyz_or_lab< const CHANNELS_CONFIGURATION: u8, @@ -216,6 +107,12 @@ pub unsafe fn neon_channels_to_xyz_or_lab< y_low_low = u; z_low_low = v; } + XyzTarget::LCH => { + let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = c; + z_low_low = h; + } } let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); @@ -244,6 +141,12 @@ pub unsafe fn neon_channels_to_xyz_or_lab< y_low_high = u; z_low_high = v; } + XyzTarget::LCH => { + let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = c; + z_low_high = h; + 
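+                        // L, C and h now occupy the x/y/z lanes for the interleaved store below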
} } let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); @@ -276,6 +179,12 @@ pub unsafe fn neon_channels_to_xyz_or_lab< y_high_low = u; z_high_low = v; } + XyzTarget::LCH => { + let (l, c, h) = neon_triple_to_lch(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = c; + z_high_low = h; + } } let xyz_low_low = float32x4x3_t(x_high_low, y_high_low, z_high_low); @@ -315,6 +224,12 @@ pub unsafe fn neon_channels_to_xyz_or_lab< y_high_high = u; z_high_high = v; } + XyzTarget::LCH => { + let (l, c, h) = neon_triple_to_lch(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = c; + z_high_high = h; + } } let xyz_low_low = float32x4x3_t(x_high_high, y_high_high, z_high_high); diff --git a/src/neon/to_xyza_laba.rs b/src/neon/to_xyza_laba.rs index 39fe53d..8cec0e5 100644 --- a/src/neon/to_xyza_laba.rs +++ b/src/neon/to_xyza_laba.rs @@ -1,25 +1,12 @@ -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -use std::arch::aarch64::*; - -#[allow(unused_imports)] use crate::gamma_curves::TransferFunction; -#[allow(unused_imports)] use crate::image::ImageConfiguration; -#[allow(unused_imports)] -use crate::image_to_xyz_lab::XyzTarget; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] +use crate::neon::cie::{ + neon_triple_to_lab, neon_triple_to_lch, neon_triple_to_luv, neon_triple_to_xyz, +}; use crate::neon::*; +use crate::xyz_target::XyzTarget; +use std::arch::aarch64::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline(always)] pub unsafe fn neon_channels_to_xyza_or_laba( start_cx: usize, @@ -103,6 +90,12 @@ pub unsafe fn neon_channels_to_xyza_or_laba {} + XyzTarget::LCH => { + let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = c; + z_low_low = h; + } XyzTarget::LUV => { let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); x_low_low = l; @@ -141,6 +134,12 @@ pub unsafe fn neon_channels_to_xyza_or_laba { + let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = c; + z_low_high = h; + } } let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); @@ -175,6 +174,12 @@ pub unsafe fn neon_channels_to_xyza_or_laba { + let (l, c, h) = neon_triple_to_lch(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = c; + z_high_low = h; + } } let a_high = vmovl_high_u8(a_chan); @@ -220,6 +225,12 @@ pub unsafe fn neon_channels_to_xyza_or_laba { + let (l, c, h) = neon_triple_to_lch(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = c; + z_high_high = h; + } } let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); diff --git a/src/neon/xyz_lab_to_image.rs b/src/neon/xyz_lab_to_image.rs index f40d1ea..8041bfd 100644 --- a/src/neon/xyz_lab_to_image.rs +++ b/src/neon/xyz_lab_to_image.rs @@ -1,113 +1,11 @@ -#[allow(unused_imports)] use crate::image::ImageConfiguration; -#[allow(unused_imports)] -use crate::image_to_xyz_lab::XyzTarget; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -use crate::luv::*; +use crate::neon::cie::{neon_lab_to_xyz, neon_lch_to_xyz, neon_luv_to_xyz}; use crate::neon::math::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] use crate::neon::*; -#[allow(unused_imports)] +use crate::xyz_target::XyzTarget; use 
crate::TransferFunction; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] use std::arch::aarch64::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[inline(always)] -unsafe fn vcubeq_f32(x: float32x4_t) -> float32x4_t { - vmulq_f32(vmulq_f32(x, x), x) -} - -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[inline(always)] -pub(crate) unsafe fn neon_luv_to_xyz( - l: float32x4_t, - u: float32x4_t, - v: float32x4_t, -) -> (float32x4_t, float32x4_t, float32x4_t) { - let zero_mask = vclezq_f32(l); - let zeros = vdupq_n_f32(0f32); - let l13 = vrecpeq_f32(vmulq_n_f32(l, 13f32)); - let u = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_U_PRIME), l13, u); - let v = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_V_PRIME), l13, v); - let l_h = vmulq_n_f32(vaddq_f32(l, vdupq_n_f32(16f32)), 1f32 / 116f32); - let y_high = vmulq_f32(vmulq_f32(l_h, l_h), l_h); - let y_low = vmulq_n_f32(l, LUV_MULTIPLIER_INVERSE_Y); - let y = vbslq_f32( - zero_mask, - zeros, - vbslq_f32(vcgtq_f32(l, vdupq_n_f32(8f32)), y_high, y_low), - ); - let zero_mask_2 = vclezq_f32(v); - let den = vrecpeq_f32(vmulq_n_f32(v, 4f32)); - let mut x = vmulq_n_f32(vmulq_f32(vmulq_f32(y, u), den), 9f32); - x = vbslq_f32(zero_mask, zeros, x); - x = vbslq_f32(zero_mask_2, zeros, x); - let mut z = vmulq_f32( - vmulq_f32( - prefer_vfmaq_f32( - prefer_vfmaq_f32(vdupq_n_f32(12f32), vdupq_n_f32(-3f32), u), - v, - vdupq_n_f32(-20f32), - ), - y, - ), - den, - ); - z = vbslq_f32(zero_mask, zeros, z); - z = vbslq_f32(zero_mask_2, zeros, z); - (x, y, z) -} - -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[inline(always)] -pub(crate) unsafe fn neon_lab_to_xyz( - l: float32x4_t, - a: float32x4_t, - b: float32x4_t, -) -> (float32x4_t, float32x4_t, float32x4_t) { - let y = vmulq_n_f32(vaddq_f32(l, vdupq_n_f32(16f32)), 1f32 / 116f32); - let x = vaddq_f32(vmulq_n_f32(a, 1f32 / 500f32), y); - let z = vsubq_f32(y, vmulq_n_f32(b, 1f32 / 200f32)); - let x3 = vcubeq_f32(x); - let y3 = vcubeq_f32(y); - let z3 = vcubeq_f32(z); - let kappa = vdupq_n_f32(0.008856f32); - let k_sub = vdupq_n_f32(16f32 / 116f32); - let low_x = vmulq_n_f32(vsubq_f32(x, k_sub), 1f32 / 7.787f32); - let low_y = vmulq_n_f32(vsubq_f32(y, k_sub), 1f32 / 7.787f32); - let low_z = vmulq_n_f32(vsubq_f32(z, k_sub), 1f32 / 7.787f32); - - let x = vbslq_f32(vcgtq_f32(x3, kappa), x3, low_x); - let y = vbslq_f32(vcgtq_f32(y3, kappa), y3, low_y); - let z = vbslq_f32(vcgtq_f32(z3, kappa), z3, low_z); - let x = vmulq_n_f32(x, 95.047f32 / 100f32); - let z = vmulq_n_f32(z, 108.883f32 / 100f32); - (x, y, z) -} - -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline(always)] pub(crate) unsafe fn neon_xyz_lab_vld< const CHANNELS_CONFIGURATION: u8, @@ -145,6 +43,12 @@ pub(crate) unsafe fn neon_xyz_lab_vld< g_f32 = y; b_f32 = z; } + XyzTarget::LCH => { + let (x, y, z) = neon_lch_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } diff --git a/src/neon/xyza_laba_to_image.rs b/src/neon/xyza_laba_to_image.rs index d2b0aaf..876b8b2 100644 --- a/src/neon/xyza_laba_to_image.rs +++ b/src/neon/xyza_laba_to_image.rs @@ -1,33 +1,11 @@ -#[allow(unused_imports)] use crate::image::ImageConfiguration; -#[allow(unused_imports)] -use crate::image_to_xyz_lab::XyzTarget; +use crate::neon::cie::{neon_lab_to_xyz, neon_lch_to_xyz, neon_luv_to_xyz}; use 
crate::neon::math::vcolorq_matrix_f32; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] use crate::neon::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] -#[allow(unused_imports)] +use crate::xyz_target::XyzTarget; use crate::TransferFunction; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] use std::arch::aarch64::*; -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline(always)] pub(crate) unsafe fn neon_xyza_lab_vld( src: *const f32, @@ -61,6 +39,12 @@ pub(crate) unsafe fn neon_xyza_lab_vld { + let (x, y, z) = neon_lch_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } diff --git a/src/sse/math.rs b/src/sse/math.rs index 1d6bd7f..1642010 100644 --- a/src/sse/math.rs +++ b/src/sse/math.rs @@ -419,3 +419,48 @@ pub(crate) unsafe fn _mm_fmod_ps(a: __m128, b: __m128) -> __m128 { let remainder = _mm_sub_ps(dividend_vec, product); // Subtract the product from the dividend remainder } + +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn _mm_is_infinity(d: __m128) -> __m128 { + return _mm_cmpeq_ps(_mm_abs_ps(d), _mm_set1_ps(f32::INFINITY)); +} + +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn _mm_cos_ps(d: __m128) -> __m128 { + let mut q = _mm_cvtps_epi32(_mm_sub_ps( + _mm_mul_ps(d, _mm_set1_ps(std::f32::consts::FRAC_1_PI)), + _mm_set1_ps(0.5f32), + )); + + q = _mm_add_epi32(_mm_add_epi32(q, q), _mm_set1_epi32(1)); + + let mut u = _mm_cvtepi32_ps(q); + let mut d = _mm_fmaf_ps(u, _mm_set1_ps(-0.78515625f32 * 2f32), d); + d = _mm_fmaf_ps(u, _mm_set1_ps(-0.00024187564849853515625f32 * 2f32), d); + d = _mm_fmaf_ps(u, _mm_set1_ps(-3.7747668102383613586e-08f32 * 2f32), d); + d = _mm_fmaf_ps(u, _mm_set1_ps(-1.2816720341285448015e-12f32 * 2f32), d); + + let s = _mm_mul_ps(d, d); + + // TODO: Perform float masking instead + d = _mm_castsi128_ps(_mm_xor_si128( + _mm_and_si128( + _mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(2)), _mm_set1_epi32(0)), + _mm_castps_si128(_mm_set1_ps(-0.0f32)), + ), + _mm_castps_si128(d), + )); + + u = _mm_set1_ps(2.6083159809786593541503e-06f32); + u = _mm_fmaf_ps(u, s, _mm_set1_ps(-0.0001981069071916863322258f32)); + u = _mm_fmaf_ps(u, s, _mm_set1_ps(0.00833307858556509017944336f32)); + u = _mm_fmaf_ps(u, s, _mm_set1_ps(-0.166666597127914428710938f32)); + + u = _mm_fmaf_ps(s, _mm_mul_ps(u, d), d); + + u = _mm_or_ps(_mm_is_infinity(d), u); + + return u; +} diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index 5b59754..9bf747c 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -5,8 +5,6 @@ use crate::avx::avx_xyz_to_channels; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget; -use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" @@ -17,7 +15,8 @@ use crate::neon::neon_xyz_to_channels; target_feature = "sse4.1" ))] use crate::sse::sse_xyz_to_channels; -use crate::{Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; +use crate::xyz_target::XyzTarget; +use crate::{LCh, Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; fn xyz_to_channels( src: &[f32], @@ -77,74 +76,76 @@ fn xyz_to_channels( - cx, - src.as_ptr(), - src_offset, - a_channel.as_ptr(), - a_offset, - dst.as_mut_ptr(), - dst_offset, - 
width, - &matrix, - transfer_function, - ) - } else { - cx = avx_xyz_to_channels::( - cx, - src.as_ptr(), - src_offset, - std::ptr::null(), - 0usize, - dst.as_mut_ptr(), - dst_offset, - width, - &matrix, - transfer_function, - ) + if source != XyzTarget::LCH { + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2" + ))] + unsafe { + if _has_avx2 { + if USE_ALPHA { + cx = avx_xyz_to_channels::( + cx, + src.as_ptr(), + src_offset, + a_channel.as_ptr(), + a_offset, + dst.as_mut_ptr(), + dst_offset, + width, + &matrix, + transfer_function, + ) + } else { + cx = avx_xyz_to_channels::( + cx, + src.as_ptr(), + src_offset, + std::ptr::null(), + 0usize, + dst.as_mut_ptr(), + dst_offset, + width, + &matrix, + transfer_function, + ) + } } } - } - #[cfg(all( - any(target_arch = "x86_64", target_arch = "x86"), - target_feature = "sse4.1" - ))] - unsafe { - if _has_sse { - if USE_ALPHA { - cx = sse_xyz_to_channels::( - cx, - src.as_ptr(), - src_offset, - a_channel.as_ptr(), - a_offset, - dst.as_mut_ptr(), - dst_offset, - width, - &matrix, - transfer_function, - ) - } else { - cx = sse_xyz_to_channels::( - cx, - src.as_ptr(), - src_offset, - std::ptr::null(), - 0usize, - dst.as_mut_ptr(), - dst_offset, - width, - &matrix, - transfer_function, - ) + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + if USE_ALPHA { + cx = sse_xyz_to_channels::( + cx, + src.as_ptr(), + src_offset, + a_channel.as_ptr(), + a_offset, + dst.as_mut_ptr(), + dst_offset, + width, + &matrix, + transfer_function, + ) + } else { + cx = sse_xyz_to_channels::( + cx, + src.as_ptr(), + src_offset, + std::ptr::null(), + 0usize, + dst.as_mut_ptr(), + dst_offset, + width, + &matrix, + transfer_function, + ) + } } } } @@ -193,11 +194,11 @@ fn xyz_to_channels { + XyzTarget::LAB => { let lab = Lab::new(l_x, l_y, l_z); rgb = lab.to_rgb(); } - XYZ => { + XyzTarget::XYZ => { let xyz = Xyz::new(l_x, l_y, l_z); rgb = xyz.to_rgb(&matrix, transfer_function); } @@ -205,6 +206,10 @@ fn xyz_to_channels { + let lch = LCh::new(l_x, l_y, l_z); + rgb = lch.to_rgb(); + } } let dst = unsafe { dst_ptr.add(x * channels) }; @@ -257,7 +262,7 @@ pub fn xyz_to_rgb( transfer_function: TransferFunction, ) { let empty_vec = vec![]; - xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { XYZ as u8 }>( + xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::XYZ as u8 }>( src, src_stride, &empty_vec, @@ -289,7 +294,7 @@ pub fn xyz_to_srgb( height: u32, ) { let empty_vec = vec![]; - xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { XYZ as u8 }>( + xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::XYZ as u8 }>( src, src_stride, &empty_vec, @@ -321,7 +326,7 @@ pub fn lab_to_srgb( height: u32, ) { let empty_vec = vec![]; - xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { LAB as u8 }>( + xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::LAB as u8 }>( src, src_stride, &empty_vec, @@ -356,7 +361,7 @@ pub fn laba_to_srgb( width: u32, height: u32, ) { - xyz_to_channels::<{ ImageConfiguration::Rgba as u8 }, true, { LAB as u8 }>( + xyz_to_channels::<{ ImageConfiguration::Rgba as u8 }, true, { XyzTarget::LAB as u8 }>( src, src_stride, &a_plane, @@ -393,7 +398,7 @@ pub fn xyza_to_rgba( matrix: &[[f32; 3]; 3], transfer_function: TransferFunction, ) { - xyz_to_channels::<{ ImageConfiguration::Rgba as u8 }, true, { XYZ as u8 }>( + xyz_to_channels::<{ ImageConfiguration::Rgba as u8 
}, true, { XyzTarget::XYZ as u8 }>(
         src,
         src_stride,
         &a_plane,
@@ -425,7 +430,7 @@ pub fn luv_to_rgb(
     height: u32,
 ) {
     let empty_vec = vec![];
-    xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { LUV as u8 }>(
+    xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::LUV as u8 }>(
         src,
         src_stride,
         &empty_vec,
@@ -457,7 +462,71 @@ pub fn luv_to_bgr(
     height: u32,
 ) {
     let empty_vec = vec![];
-    xyz_to_channels::<{ ImageConfiguration::Bgr as u8 }, false, { LUV as u8 }>(
+    xyz_to_channels::<{ ImageConfiguration::Bgr as u8 }, false, { XyzTarget::LUV as u8 }>(
+        src,
+        src_stride,
+        &empty_vec,
+        0,
+        dst,
+        dst_stride,
+        width,
+        height,
+        &XYZ_TO_SRGB_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts LCH to RGB. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing LCH data
+/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `dst` - A mutable slice to receive RGB data
+/// * `dst_stride` - Bytes per row for dst data
+pub fn lch_to_rgb(
+    src: &[f32],
+    src_stride: u32,
+    dst: &mut [u8],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+) {
+    let empty_vec = vec![];
+    xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { XyzTarget::LCH as u8 }>(
+        src,
+        src_stride,
+        &empty_vec,
+        0,
+        dst,
+        dst_stride,
+        width,
+        height,
+        &XYZ_TO_SRGB_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts LCH to BGR. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing LCH data
+/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `dst` - A mutable slice to receive BGR data
+/// * `dst_stride` - Bytes per row for dst data
+pub fn lch_to_bgr(
+    src: &[f32],
+    src_stride: u32,
+    dst: &mut [u8],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+) {
+    let empty_vec = vec![];
+    xyz_to_channels::<{ ImageConfiguration::Bgr as u8 }, false, { XyzTarget::LCH as u8 }>(
         src,
         src_stride,
         &empty_vec,
diff --git a/src/xyz_target.rs b/src/xyz_target.rs
new file mode 100644
index 0000000..63af39a
--- /dev/null
+++ b/src/xyz_target.rs
@@ -0,0 +1,21 @@
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub(crate) enum XyzTarget {
+    LAB = 0,
+    XYZ = 1,
+    LUV = 2,
+    LCH = 3,
+}
+
+impl From<u8> for XyzTarget {
+    fn from(value: u8) -> Self {
+        match value {
+            0 => XyzTarget::LAB,
+            1 => XyzTarget::XYZ,
+            2 => XyzTarget::LUV,
+            3 => XyzTarget::LCH,
+            _ => {
+                panic!("Not implemented")
+            }
+        }
+    }
+}
diff --git a/src/xyza_laba_to_image.rs b/src/xyza_laba_to_image.rs
index 9c3e8ef..f0511ba 100644
--- a/src/xyza_laba_to_image.rs
+++ b/src/xyza_laba_to_image.rs
@@ -5,8 +5,6 @@ use crate::avx::avx_xyza_to_image;
 use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
-use crate::image_to_xyz_lab::XyzTarget;
-use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ};
 #[cfg(all(
     any(target_arch = "aarch64", target_arch = "arm"),
     target_feature = "neon"
 ))]
@@ -17,7 +15,8 @@ use crate::neon::neon_xyza_to_image;
     target_feature = "sse4.1"
 ))]
 use crate::sse::sse_xyza_to_image;
-use crate::{Lab, Luv, Xyz, XYZ_TO_SRGB_D65};
+use crate::xyz_target::XyzTarget;
+use crate::{LCh, Lab, Luv, Xyz, XYZ_TO_SRGB_D65};
 
 fn xyz_with_alpha_to_channels<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
     src: &[f32],
     src_stride: u32,
     dst: &mut [u8],
     dst_stride: u32,
     width: u32,
     height: u32,
     matrix: &[[f32; 3]; 3],
     transfer_function: TransferFunction,
 ) {
@@ -72,32 +71,51 @@ fn xyz_with_alpha_to_channels(
-    #[cfg(all(
-        any(target_arch = "x86_64", target_arch = "x86"),
-        target_feature = "avx2"
-    ))]
-    unsafe {
-        if _has_avx2 {
-            cx = avx_xyza_to_image::<CHANNELS_CONFIGURATION, TARGET>(
-                cx,
-                src.as_ptr(),
-                src_offset,
-                dst.as_mut_ptr(),
-                dst_offset,
-                width,
-                &matrix,
-                transfer_function,
-            )
+    if source != XyzTarget::LCH {
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "avx2"
+        ))]
+        unsafe {
+            if _has_avx2 {
+                cx = avx_xyza_to_image::<CHANNELS_CONFIGURATION, TARGET>(
+                    cx,
+                    src.as_ptr(),
+                    src_offset,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    width,
+                    &matrix,
+                    transfer_function,
+                )
+            }
         }
-    }
-    #[cfg(all(
-        any(target_arch = "x86_64", target_arch = "x86"),
-        target_feature = "sse4.1"
-    ))]
-    unsafe {
-        if _has_sse {
-            cx = sse_xyza_to_image::<CHANNELS_CONFIGURATION, TARGET>(
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
+        unsafe {
+            if _has_sse {
+                cx = sse_xyza_to_image::<CHANNELS_CONFIGURATION, TARGET>(
+                    cx,
+                    src.as_ptr(),
+                    src_offset,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    width,
+                    &matrix,
+                    transfer_function,
+                )
+            }
+        }
+
+        #[cfg(all(
+            any(target_arch = "aarch64", target_arch = "arm"),
+            target_feature = "neon"
+        ))]
+        unsafe {
+            cx = neon_xyza_to_image::<CHANNELS_CONFIGURATION, TARGET>(
                 cx,
                 src.as_ptr(),
                 src_offset,
@@ -110,23 +128,6 @@ fn xyz_with_alpha_to_channels(
-                cx,
-                src.as_ptr(),
-                src_offset,
-                dst.as_mut_ptr(),
-                dst_offset,
-                width,
-                &matrix,
-                transfer_function,
-            )
-        }
-
     let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *mut f32 };
     let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) };
@@ -137,18 +138,22 @@ fn xyz_with_alpha_to_channels(
         match source {
-            LAB => {
+            XyzTarget::LAB => {
                 let lab = Lab::new(l_x, l_y, l_z);
                 rgb = lab.to_rgb();
             }
-            XYZ => {
+            XyzTarget::XYZ => {
                 let xyz = Xyz::new(l_x, l_y, l_z);
                 rgb = xyz.to_rgb(&matrix, transfer_function);
             }
-            LUV => {
+            XyzTarget::LUV => {
                 let luv = Luv::new(l_x, l_y, l_z);
                 rgb = luv.to_rgb();
             }
+            XyzTarget::LCH => {
+                let lch = LCh::new(l_x, l_y, l_z);
+                rgb = lch.to_rgb();
+            }
         }
 
         let l_a = unsafe { src_ptr.add(px + 3).read_unaligned() };
@@ -188,7 +193,7 @@ pub fn lab_with_alpha_to_rgba(
     width: u32,
     height: u32,
 ) {
-    xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { LAB as u8 }>(
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -217,7 +222,7 @@ pub fn lab_with_alpha_to_bgra(
     width: u32,
     height: u32,
 ) {
-    xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { LAB as u8 }>(
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::LAB as u8 }>(
         src,
         src_stride,
         dst,
@@ -246,7 +251,7 @@ pub fn luv_with_alpha_to_rgba(
     width: u32,
     height: u32,
 ) {
-    xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { LUV as u8 }>(
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::LUV as u8 }>(
         src,
         src_stride,
         dst,
@@ -277,7 +282,7 @@ pub fn luv_with_alpha_to_bgra(
     width: u32,
     height: u32,
 ) {
-    xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { LAB as u8 }>(
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::LUV as u8 }>(
         src,
         src_stride,
         dst,
@@ -306,7 +311,7 @@ pub fn xyz_with_alpha_to_rgba(
     width: u32,
     height: u32,
 ) {
-    xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XYZ as u8 }>(
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::XYZ as u8 }>(
         src,
         src_stride,
         dst,
@@ -335,7 +340,65 @@ pub fn xyz_with_alpha_to_bgra(
     width: u32,
     height: u32,
 ) {
-    xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XYZ as u8 }>(
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::XYZ as u8 }>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        &XYZ_TO_SRGB_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts LCH with separate alpha channel to RGBA. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing LCH(a) data
+/// * `src_stride` - Bytes per row for src data.
+/// * `dst` - A mutable slice to receive RGBA data
+/// * `dst_stride` - Bytes per row for dst data
+/// * `width` - Image width
+/// * `height` - Image height
+pub fn lch_with_alpha_to_rgba(
+    src: &[f32],
+    src_stride: u32,
+    dst: &mut [u8],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+) {
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::LCH as u8 }>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        &XYZ_TO_SRGB_D65,
+        TransferFunction::Srgb,
+    );
+}
+
+/// This function converts LCH with separate alpha channel to BGRA. This is much more efficient than a naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice containing LCH(a) data
+/// * `src_stride` - Bytes per row for src data.
+/// * `dst` - A mutable slice to receive BGRA data
+/// * `dst_stride` - Bytes per row for dst data
+/// * `width` - Image width
+/// * `height` - Image height
+pub fn lch_with_alpha_to_bgra(
+    src: &[f32],
+    src_stride: u32,
+    dst: &mut [u8],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+) {
+    xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::LCH as u8 }>(
         src,
         src_stride,
         dst,