From 69af0da8174d4277334a03acacdb174a0758169e Mon Sep 17 00:00:00 2001 From: awxkee Date: Wed, 5 Jun 2024 23:57:40 +0100 Subject: [PATCH] LUV bugfixes, improve LCh --- src/app/src/main.rs | 105 +++++++++++---------------------- src/avx2_to_xyz_lab.rs | 56 ++++++++++++++++++ src/concat_alpha.rs | 1 + src/image_to_linear.rs | 2 +- src/image_to_linear_u8.rs | 5 +- src/image_to_xyz_lab.rs | 77 +++++++++++++++++++++++- src/image_xyza_laba.rs | 82 ++++++++++++++++++++++++++- src/lib.rs | 8 +++ src/linear_to_image.rs | 5 +- src/linear_to_image_u8.rs | 7 ++- src/luv.rs | 28 ++++----- src/neon_math.rs | 87 +++++++++++++++++++++++---- src/neon_to_xyz_lab.rs | 60 +++++++++++++++++++ src/neon_to_xyza_laba.rs | 29 +++++++++ src/neon_xyz_lab_to_image.rs | 62 +++++++++++++++++++-- src/neon_xyza_laba_to_image.rs | 11 ++++ src/rgb_expand.rs | 1 + src/sse_to_xyz_lab.rs | 56 ++++++++++++++++++ src/xyz_lab_to_image.rs | 72 +++++++++++++++++++++- src/xyza_laba_to_image.rs | 71 +++++++++++++++++++++- 20 files changed, 714 insertions(+), 111 deletions(-) diff --git a/src/app/src/main.rs b/src/app/src/main.rs index ecbc2ff..dd643a4 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -27,13 +27,13 @@ fn main() { // _mm_storeu_ps(dst.as_mut_ptr() as *mut f32, ln); // println!("{:?}", dst); // } - // #[cfg(target_arch = "aarch64")] - // unsafe { - // let m = vdupq_n_f32(std::f32::consts::E); - // let cbrt = vlogq_f32_ulp35(m); - // let l = vgetq_lane_f32::<0>(cbrt); - // println!("Exp {}", l); - // } + #[cfg(target_arch = "aarch64")] + unsafe { + let m = vdupq_n_f32(27f32); + let cbrt = vcbrtq_f32_ulp2(m); + let l = vgetq_lane_f32::<0>(cbrt); + println!("Cbrt {}", l); + } let img = ImageReader::open("./assets/asset_middle.jpg") .unwrap() @@ -68,9 +68,7 @@ fn main() { let mut lab_store: Vec<f32> = vec![]; let store_stride = width as usize * 4usize * std::mem::size_of::<f32>(); lab_store.resize(width as usize * 4usize * height as usize, 0f32); - let mut alpha_store: Vec<f32> = vec![]; - let alpha_stride = width as usize * std::mem::size_of::<f32>(); - alpha_store.resize(width as usize * height as usize, 0f32); + let start_time = Instant::now(); rgba_to_lab_with_alpha( src_bytes, 4u32 * width, @@ -110,6 +108,10 @@ fn main() { height, ); + let elapsed_time = start_time.elapsed(); + // Print the elapsed time in milliseconds + println!("LUV round-trip: {:.2?}", elapsed_time); + // laba_to_srgb( // &lab_store, // lab_stride as u32, // &mut dst_slice, // width * 4, // width, // height, // ); src_bytes = &dst_slice; } - let mut xyz: Vec<f32> = vec![]; - xyz.resize(4 * width as usize * height as usize, 0f32); - - let mut a_plane: Vec<f32> = vec![]; - a_plane.resize(width as usize * height as usize, 0f32); - - for i in 0..1 { - let start_time = Instant::now(); - // srgba_to_xyza( - // src_bytes, - // width * components, - // &mut xyz, - // width * 3 * std::mem::size_of::<f32>() as u32, - // &mut a_plane, - // width as u32 * std::mem::size_of::<f32>() as u32, - // width, - // height, - // ); - // rgba_to_linear( - // src_bytes, - // width * components, - // &mut xyz, - // width * 3 * std::mem::size_of::<f32>() as u32, - // width, - // height, - // TransferFunction::Srgb, - // ); - rgba_to_linear( - src_bytes, - width * components, - &mut xyz, - width * 4 * std::mem::size_of::<f32>() as u32, - width, - height, - TransferFunction::Srgb, - ); - let elapsed_time = start_time.elapsed(); - // Print the elapsed time in milliseconds - println!("sRGB to XYZ: {:.2?}", elapsed_time); - } - - let mut dst_bytes: Vec<u8> = vec![]; - dst_bytes.resize(width as usize * components as usize * height as usize, 0u8);
- - let start_time = Instant::now(); + // let mut xyz: Vec<f32> = vec![]; + // xyz.resize(4 * width as usize * height as usize, 0f32); + // + // let mut a_plane: Vec<f32> = vec![]; + // a_plane.resize(width as usize * height as usize, 0f32); + // + // let mut dst_bytes: Vec<u8> = vec![]; + // dst_bytes.resize(width as usize * components as usize * height as usize, 0u8); + // + // let start_time = Instant::now(); // xyz_to_srgb( // &xyz, // width * 3 * std::mem::size_of::<f32>() as u32, // &mut dst_bytes, // width * components, // width, // height, // ); - - linear_to_rgba( - &xyz, - width * 4 * std::mem::size_of::<f32>() as u32, - &mut dst_bytes, - width * components, - width, - height, - TransferFunction::Srgb, - ); + // + // linear_to_rgba( + // &xyz, + // width * 4 * std::mem::size_of::<f32>() as u32, + // &mut dst_bytes, + // width * components, + // width, + // height, + // TransferFunction::Srgb, + // ); // linear_to_rgb( // &xyz, // width * 3 * std::mem::size_of::<f32>() as u32, // &mut dst_bytes, // width * components, // width, // height, // TransferFunction::Srgb, // ); - let elapsed_time = start_time.elapsed(); + // let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds - println!("XYZ to sRGB: {:.2?}", elapsed_time); + // println!("XYZ to sRGB: {:.2?}", elapsed_time); // let rgba = rgb_to_rgba(&dst_bytes, width, height); if components == 4 { image::save_buffer( "converted.png", - dst_bytes.as_bytes(), + src_bytes.as_bytes(), dimensions.0, dimensions.1, image::ExtendedColorType::Rgba8, ) .unwrap(); } else { image::save_buffer( "converted.jpg", - dst_bytes.as_bytes(), + src_bytes.as_bytes(), dimensions.0, dimensions.1, image::ExtendedColorType::Rgb8, ) diff --git a/src/avx2_to_xyz_lab.rs b/src/avx2_to_xyz_lab.rs index 6b19bc9..8a64fde 100644 --- a/src/avx2_to_xyz_lab.rs +++ b/src/avx2_to_xyz_lab.rs @@ -21,6 +21,7 @@ use crate::x86_64_simd_support::*; use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; +use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] @@ -66,6 +67,37 @@ unsafe fn avx2_triple_to_xyz( (x, y, z) } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +pub(crate) unsafe fn avx2_triple_to_luv( + x: __m256, + y: __m256, + z: __m256, +) -> (__m256, __m256, __m256) { + let zeros = _mm256_setzero_ps(); + let den = _mm256_prefer_fma_ps( + _mm256_prefer_fma_ps(x, z, _mm256_set1_ps(3f32)), + y, + _mm256_set1_ps(15f32), + ); + let nan_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(den, _mm256_set1_ps(0f32)); + let l_low_mask = _mm256_cmp_ps::<_CMP_LT_OS>(y, _mm256_set1_ps(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = _mm256_cbrt_ps(y); + let l = _mm256_select_ps( + l_low_mask, + _mm256_mul_ps(y, _mm256_set1_ps(LUV_MULTIPLIER_FORWARD_Y)), + _mm256_prefer_fma_ps(_mm256_set1_ps(-16f32), y_cbrt, _mm256_set1_ps(116f32)), + ); + let u_prime = _mm256_div_ps(_mm256_mul_ps(x, _mm256_set1_ps(4f32)), den); + let v_prime = _mm256_div_ps(_mm256_mul_ps(y, _mm256_set1_ps(9f32)), den); + let sub_u_prime = _mm256_sub_ps(u_prime, _mm256_set1_ps(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = _mm256_sub_ps(v_prime, _mm256_set1_ps(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = _mm256_mul_ps(l, _mm256_set1_ps(13f32)); + let u = _mm256_select_ps(nan_mask, zeros, _mm256_mul_ps(l13, sub_u_prime)); + let v = _mm256_select_ps(nan_mask, zeros, _mm256_mul_ps(l13, sub_v_prime)); + (l, u, v) +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -191,6 +223,12 @@ pub(crate) unsafe fn
avx2_channels_to_xyz_or_lab< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let write_dst_ptr = dst_ptr.add(cx * 3); @@ -218,6 +256,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_low_high, y_low_high, z_low_high); @@ -246,6 +290,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_low, y_high_low, z_high_low); @@ -281,6 +331,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< z_high_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_high, y_high_high, z_high_high); diff --git a/src/concat_alpha.rs b/src/concat_alpha.rs index 0d657d8..30d804c 100644 --- a/src/concat_alpha.rs +++ b/src/concat_alpha.rs @@ -37,6 +37,7 @@ pub fn append_alpha( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { + #[cfg(target_feature = "sse4.1")] if is_x86_feature_detected!("sse4.1") { _use_sse = true; } diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs index 67ce497..8997066 100644 --- a/src/image_to_linear.rs +++ b/src/image_to_linear.rs @@ -38,7 +38,7 @@ fn channels_to_linear( #[cfg(target_arch = "x86_64")] let mut has_sse = false; - #[cfg(target_arch = "x86_64")] + #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))] if is_x86_feature_detected!("sse4.1") { has_sse = true; } diff --git a/src/image_to_linear_u8.rs b/src/image_to_linear_u8.rs index 107d1d0..1928117 100644 --- a/src/image_to_linear_u8.rs +++ b/src/image_to_linear_u8.rs @@ -45,7 +45,10 @@ fn channels_to_linear( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] let mut _has_sse = false; - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] if is_x86_feature_detected!("sse4.1") { _has_sse = true; } diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index 62725e5..3536ad8 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -3,7 +3,7 @@ use crate::avx2_to_xyz_lab::*; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] @@ -17,6 +17,7 @@ use std::slice; pub(crate) enum XyzTarget { LAB = 0, XYZ = 1, + LUV = 2, } impl From<u8> for XyzTarget { @@ -24,6 +25,7 @@ impl From<u8> for XyzTarget { match value { 0 => LAB, 1 => XYZ, + 2 => LUV, _ => { panic!("Not implemented") } @@ -69,6 +71,7 @@ fn channels_to_xyz { + XyzTarget::LUV => { + let luv = rgb.to_luv(); + unsafe { + *dst_slice.get_unchecked_mut(x * 3) = luv.l; + *dst_slice.get_unchecked_mut(x * 3 + 1) = luv.u; + *dst_slice.get_unchecked_mut(x * 3 + 2) = luv.v; + } + } } if USE_ALPHA && image_configuration.has_alpha() { @@ -611,3
+622,67 @@ pub fn bgr_to_lab( TransferFunction::Srgb, ); } + +/// This function converts RGB to CIE L*uv against D65 white point. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGB data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data +/// * `dst_stride` - Bytes per row for dst data +pub fn rgb_to_luv( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + let mut empty_vec = vec![]; + channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + &mut empty_vec, + 0, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts BGR to CIE L*uv against D65 white point. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains BGR data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data +/// * `dst_stride` - Bytes per row for dst data +pub fn bgr_to_luv( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + let mut empty_vec = vec![]; + channels_to_xyz::<{ ImageConfiguration::Bgr as u8 }, false, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + &mut empty_vec, + 0, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} diff --git a/src/image_xyza_laba.rs b/src/image_xyza_laba.rs index acb1890..e58ac2f 100644 --- a/src/image_xyza_laba.rs +++ b/src/image_xyza_laba.rs @@ -1,6 +1,6 @@ use crate::image::ImageConfiguration; use crate::image_to_xyz_lab::XyzTarget; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; use crate::{Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65}; use std::slice; #[cfg(all( @@ -73,11 +73,11 @@ fn channels_to_xyz_with_alpha< let rgb = Rgb::<u8>::new(r, g, b); + let px = x * CHANNELS; match target { LAB => { let lab = rgb.to_lab(); unsafe { - let px = x * CHANNELS; *dst_slice.get_unchecked_mut(px) = lab.l; *dst_slice.get_unchecked_mut(px + 1) = lab.a; *dst_slice.get_unchecked_mut(px + 2) = lab.b; @@ -85,13 +85,20 @@ fn channels_to_xyz_with_alpha< XYZ => { let xyz = Xyz::from_rgb(&rgb, &matrix, transfer_function); - let px = x * CHANNELS; unsafe { *dst_slice.get_unchecked_mut(px) = xyz.x; *dst_slice.get_unchecked_mut(px + 1) = xyz.y; *dst_slice.get_unchecked_mut(px + 2) = xyz.z; } } + XyzTarget::LUV => { + let luv = rgb.to_luv(); + unsafe { + *dst_slice.get_unchecked_mut(px) = luv.l; + *dst_slice.get_unchecked_mut(px + 1) = luv.u; + *dst_slice.get_unchecked_mut(px + 2) = luv.v; + } + } } let a = unsafe { @@ -175,3 +182,72 @@ pub fn bgra_to_lab_with_alpha( TransferFunction::Srgb, ); } + + +/// This function converts RGBA to CIE L*uv against D65 white point, preserving the alpha channel normalized to [0, 1] in the last position. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGBA data +/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data with alpha +/// * `dst_stride` - Bytes per row for dst data +/// The alpha channel is normalized to the [0, 1] range and written +/// as the fourth component of each destination pixel. +pub fn rgba_to_luv_with_alpha( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + channels_to_xyz_with_alpha::< + { ImageConfiguration::Rgba as u8 }, + { LUV as u8 }, + >( + src, + src_stride, + dst, + dst_stride, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts BGRA to CIE L*uv against D65 white point, preserving the alpha channel normalized to [0, 1] in the last position. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains BGRA data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data with alpha +/// * `dst_stride` - Bytes per row for dst data +/// The alpha channel is normalized to the [0, 1] range and written +/// as the fourth component of each destination pixel. +pub fn bgra_to_luv_with_alpha( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + channels_to_xyz_with_alpha::< + { ImageConfiguration::Bgra as u8 }, + { LUV as u8 }, + >( + src, + src_stride, + dst, + dst_stride, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} diff --git a/src/lib.rs b/src/lib.rs index dd5786d..87ada82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,18 +67,26 @@ pub use image_to_xyz_lab::bgra_to_laba; pub use image_to_xyz_lab::bgr_to_lab; pub use image_to_xyz_lab::srgb_to_xyz; pub use image_to_xyz_lab::rgba_to_lab; +pub use image_to_xyz_lab::bgr_to_luv; +pub use image_to_xyz_lab::rgb_to_luv; pub use xyz_lab_to_image::xyz_to_rgb; pub use xyz_lab_to_image::lab_to_srgb; pub use xyz_lab_to_image::xyz_to_srgb; pub use xyz_lab_to_image::laba_to_srgb; pub use xyz_lab_to_image::xyza_to_rgba; +pub use xyz_lab_to_image::luv_to_rgb; +pub use xyz_lab_to_image::luv_to_bgr; pub use image_to_linear::*; pub use linear_to_image::*; pub use concat_alpha::append_alpha; pub use image_xyza_laba::rgba_to_lab_with_alpha; pub use image_xyza_laba::bgra_to_lab_with_alpha; +pub use image_xyza_laba::rgba_to_luv_with_alpha; +pub use image_xyza_laba::bgra_to_luv_with_alpha; pub use xyza_laba_to_image::lab_with_alpha_to_bgra; pub use xyza_laba_to_image::lab_with_alpha_to_rgba; +pub use xyza_laba_to_image::luv_with_alpha_to_bgra; +pub use xyza_laba_to_image::luv_with_alpha_to_rgba; pub use image_to_linear_u8::*; pub use linear_to_image_u8::*; diff --git a/src/linear_to_image.rs b/src/linear_to_image.rs index 730b0f0..4304104 100644 --- a/src/linear_to_image.rs +++ b/src/linear_to_image.rs @@ -38,7 +38,10 @@ fn linear_to_gamma_channels( @@ -45,7 +45,10 @@ fn linear_to_gamma_channels) -> Self { let xyz = Xyz::from_srgb(rgb); let [x, y, z] = [xyz.x, xyz.y, xyz.z]; let den = x + 15.0 * y + 3.0 * z; - let l = (if y < CUTOFF_FORWARD_Y { - MULTIPLIER_FORWARD_Y * y + let l = (if y < LUV_CUTOFF_FORWARD_Y { + LUV_MULTIPLIER_FORWARD_Y * y } else { 116f32 * y.cbrt() - 16f32 }) @@ -73,10 +73,10 @@ impl Luv { .max(0f32); let (u, v); if den != 0f32 { - let u_prime = 4.0 * x / den; - let v_prime = 9.0 * y / den; - u = 13f32 * l * (u_prime - WHITE_U_PRIME); - v = 13f32 * l * (v_prime - WHITE_V_PRIME); + let u_prime = 4f32 * x / den; + let v_prime = 9f32 * y / den; +
u = 13f32 * l * (u_prime - LUV_WHITE_U_PRIME); + v = 13f32 * l * (v_prime - LUV_WHITE_V_PRIME); } else { u = 0f32; v = 0f32; @@ -95,12 +95,12 @@ impl Luv { return Xyz::new(0f32, 0f32, 0f32).to_srgb(); } let l13 = 1f32 / (13f32 * self.l); - let u = self.u * l13 + WHITE_U_PRIME; - let v = self.v * l13 + WHITE_V_PRIME; + let u = self.u * l13 + LUV_WHITE_U_PRIME; + let v = self.v * l13 + LUV_WHITE_V_PRIME; let y = if self.l > 8f32 { ((self.l + 16f32) / 116f32).powi(3) } else { - self.l * MULTIPLIER_INVERSE_Y + self.l * LUV_MULTIPLIER_INVERSE_Y }; let (x, z); if v != 0f32 { diff --git a/src/neon_math.rs b/src/neon_math.rs index 76982cb..e7712b1 100644 --- a/src/neon_math.rs +++ b/src/neon_math.rs @@ -64,7 +64,7 @@ pub unsafe fn vrintq_s32(d: float32x4_t) -> int32x4_t { vreinterpretq_u32_f32(vdupq_n_f32(-0.0f32)), ), vreinterpretq_u32_f32(vdupq_n_f32(0.5f32)), - ),) + )), )); } @@ -144,7 +144,11 @@ pub unsafe fn vexpq_f32_ulp1(d: float32x4_t) -> float32x4_t { let q = vrintq_s32(vmulq_f32(d, vdupq_n_f32(std::f32::consts::LOG2_E))); let mut s = vmlafq_f32(vcvtq_f32_s32(q), vdupq_n_f32(-std::f32::consts::LN_2), d); - s = vmlafq_f32(vcvtq_f32_s32(q), vdupq_n_f32(-1.428606765330187045e-06f32), s); + s = vmlafq_f32( + vcvtq_f32_s32(q), + vdupq_n_f32(-1.428606765330187045e-06f32), + s, + ); let mut u = vdupq_n_f32(0.000198527617612853646278381f32); u = vmlafq_f32(u, s, vdupq_n_f32(0.00139304355252534151077271f32)); @@ -157,8 +161,15 @@ pub unsafe fn vexpq_f32_ulp1(d: float32x4_t) -> float32x4_t { u = vldexp2q_f32(u, q); - u = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vcltq_f32(d, vdupq_n_f32(-104f32)))); - u = vbslq_f32(vcltq_f32(vdupq_n_f32(100f32), d), vdupq_n_f32(f32::INFINITY), u); + u = vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(u), + vcltq_f32(d, vdupq_n_f32(-104f32)), + )); + u = vbslq_f32( + vcltq_f32(vdupq_n_f32(100f32), d), + vdupq_n_f32(f32::INFINITY), + u, + ); u } @@ -204,7 +215,9 @@ pub unsafe fn vlogq_f32(x: float32x4_t) -> float32x4_t { ))] #[inline(always)] #[allow(dead_code)] -pub unsafe fn visnanq_f32(x: float32x4_t) -> uint32x4_t { return vmvnq_u32(vceqq_f32(x, x)); } +pub unsafe fn visnanq_f32(x: float32x4_t) -> uint32x4_t { + return vmvnq_u32(vceqq_f32(x, x)); +} #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -212,7 +225,9 @@ pub unsafe fn visnanq_f32(x: float32x4_t) -> uint32x4_t { return vmvnq_u32(vceqq ))] #[inline(always)] #[allow(dead_code)] -pub unsafe fn vispinfq_f32(d: float32x4_t) -> uint32x4_t { return vceqq_f32(d, vdupq_n_f32(f32::INFINITY)); } +pub unsafe fn vispinfq_f32(d: float32x4_t) -> uint32x4_t { + return vceqq_f32(d, vdupq_n_f32(f32::INFINITY)); +} #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -225,11 +240,14 @@ pub unsafe fn vlogq_f32_ulp35(d: float32x4_t) -> float32x4_t { let o = vceqq_f32(d, vdupq_n_f32(f32::MIN)); let m = (1i64 << 32i64) as f32; let d = vbslq_f32(o, vmulq_f32(d, vdupq_n_f32(m * m)), d); - let e = vilogbk_vi2_vf(vmulq_f32(d, vdupq_n_f32(1.0f32/0.75f32))); + let e = vilogbk_vi2_vf(vmulq_f32(d, vdupq_n_f32(1.0f32 / 0.75f32))); let m = vldexp2q_f32(d, vnegq_s32(e)); let e = vbslq_s32(o, vsubq_s32(e, vdupq_n_s32(64)), e); - let mut x = vdivq_f32(vsubq_f32(m, vdupq_n_f32(1.0f32)), vaddq_f32(vdupq_n_f32(1.0f32), m)); + let mut x = vdivq_f32( + vsubq_f32(m, vdupq_n_f32(1.0f32)), + vaddq_f32(vdupq_n_f32(1.0f32), m), + ); let x2 = vmulq_f32(x, x); let mut t = vdupq_n_f32(0.2392828464508056640625f32); @@ -238,9 +256,17 @@ pub unsafe fn vlogq_f32_ulp35(d: float32x4_t) -> float32x4_t { t 
= vmlafq_f32(t, x2, vdupq_n_f32(0.666666686534881591796875f32)); t = vmlafq_f32(t, x2, vdupq_n_f32(2.0f32)); - x = vmlafq_f32(x, t, vmulq_f32(vdupq_n_f32(std::f32::consts::LN_2), vcvtq_f32_s32(e))); + x = vmlafq_f32( + x, + t, + vmulq_f32(vdupq_n_f32(std::f32::consts::LN_2), vcvtq_f32_s32(e)), + ); x = vbslq_f32(vispinfq_f32(d), vdupq_n_f32(f32::NAN), x); - x = vbslq_f32(vorrq_u32(vcltq_f32(d, vdupq_n_f32(0f32)), visnanq_f32(d)), vdupq_n_f32(f32::NAN), x); + x = vbslq_f32( + vorrq_u32(vcltq_f32(d, vdupq_n_f32(0f32)), visnanq_f32(d)), + vdupq_n_f32(f32::NAN), + x, + ); x = vbslq_f32(vceqq_f32(d, vdupq_n_f32(0f32)), vdupq_n_f32(-f32::NAN), x); return x; } @@ -351,6 +377,47 @@ pub unsafe fn vcbrtq_f32(d: float32x4_t) -> float32x4_t { vpowq_n_f32(d, 1f32 / 3f32) } +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +#[inline(always)] +#[allow(dead_code)] +/// Precise version of Cube Root with ULP 2 +pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t { + let x1p24 = vreinterpretq_f32_u32(vdupq_n_u32(0x4b800000)); // 0x1p24f === 2 ^ 24 + + let mut ui = vreinterpretq_u32_f32(x); + let hx = vandq_u32(ui, vdupq_n_u32(0x7fffffff)); + + let nan_mask = vcgeq_u32(hx, vdupq_n_u32(0x7f800000)); + let is_zero_mask = vceqzq_u32(hx); + + let lo_mask = vcltq_u32(hx, vdupq_n_u32(0x00800000)); + let hi_ui_f = vreinterpretq_u32_f32(vmulq_f32(x, x1p24)); + let mut lo_hx = vandq_u32(hi_ui_f, vdupq_n_u32(0x7fffffff)); + lo_hx = vaddq_u32(vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(lo_hx), 1f32/3f32)), vdupq_n_u32(642849266)); + let hi_hx = vaddq_u32(vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(hx), 1f32/3f32)), vdupq_n_u32(709958130)); + let hx = vbslq_u32(lo_mask, lo_hx, hi_hx); + + ui = vbslq_u32(lo_mask, hi_ui_f, ui); + ui = vandq_u32(ui, vdupq_n_u32(0x80000000)); + ui = vorrq_u32(ui, hx); + + let mut t = vreinterpretq_f32_u32(ui); + let mut r = vmulq_f32(vmulq_f32(t, t), t); + + let sum_x = vaddq_f32(x, x); + + t = vmulq_f32(vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), t); + + r = vmulq_f32(vmulq_f32(t, t), t); + t = vmulq_f32(vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), t); + t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t); + t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t); + t +} + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" diff --git a/src/neon_to_xyz_lab.rs b/src/neon_to_xyz_lab.rs index 6936176..54b49e2 100644 --- a/src/neon_to_xyz_lab.rs +++ b/src/neon_to_xyz_lab.rs @@ -5,6 +5,8 @@ use crate::image::ImageConfiguration; #[allow(unused_imports)] use crate::image_to_xyz_lab::XyzTarget; #[allow(unused_imports)] +use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; +#[allow(unused_imports)] use crate::neon_gamma_curves::*; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -63,6 +65,40 @@ pub(crate) unsafe fn neon_triple_to_xyz( (x, y, z) } +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +#[inline(always)] +pub(crate) unsafe fn neon_triple_to_luv( + x: float32x4_t, + y: float32x4_t, + z: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let zeros = vdupq_n_f32(0f32); + let den = prefer_vfmaq_f32( + prefer_vfmaq_f32(x, z, vdupq_n_f32(3f32)), + y, + vdupq_n_f32(15f32), + ); + let nan_mask = vceqzq_f32(den); + let l_low_mask = vcltq_f32(y, vdupq_n_f32(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = vcbrtq_f32(y); + let l = vbslq_f32( + l_low_mask, + vmulq_n_f32(y, LUV_MULTIPLIER_FORWARD_Y), + 
prefer_vfmaq_f32(vdupq_n_f32(-16f32), y_cbrt, vdupq_n_f32(116f32)), + ); + let u_prime = vdivq_f32(vmulq_n_f32(x, 4f32), den); + let v_prime = vdivq_f32(vmulq_n_f32(y, 9f32), den); + let sub_u_prime = vsubq_f32(u_prime, vdupq_n_f32(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = vsubq_f32(v_prime, vdupq_n_f32(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = vmulq_n_f32(l, 13f32); + let u = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_u_prime)); + let v = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_v_prime)); + (l, u, v) +} + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" @@ -191,6 +227,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); @@ -213,6 +255,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); @@ -239,6 +287,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let xyz_low_low = float32x4x3_t(x_high_low, y_high_low, z_high_low); @@ -272,6 +326,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_high_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let xyz_low_low = float32x4x3_t(x_high_high, y_high_high, z_high_high); diff --git a/src/neon_to_xyza_laba.rs b/src/neon_to_xyza_laba.rs index 963391a..4a16914 100644 --- a/src/neon_to_xyza_laba.rs +++ b/src/neon_to_xyza_laba.rs @@ -21,6 +21,11 @@ use crate::neon_to_xyz_lab::get_neon_linear_transfer; any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] +use crate::neon_to_xyz_lab::neon_triple_to_luv; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] use crate::neon_to_xyz_lab::{neon_triple_to_lab, neon_triple_to_xyz}; #[cfg(all( @@ -113,6 +118,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let a_low = vmovl_u8(vget_low_u8(a_chan)); @@ -139,6 +150,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); @@ -167,6 +184,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let a_high = vmovl_high_u8(a_chan); @@ -206,6 +229,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_high_high = b; } 
XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); diff --git a/src/neon_xyz_lab_to_image.rs b/src/neon_xyz_lab_to_image.rs index 24a9332..5a0bcfa 100644 --- a/src/neon_xyz_lab_to_image.rs +++ b/src/neon_xyz_lab_to_image.rs @@ -6,12 +6,17 @@ use crate::image_to_xyz_lab::XyzTarget; any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] +use crate::luv::*; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] use crate::neon_linear_to_image::get_neon_gamma_transfer; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] -use crate::neon_math::vcolorq_matrix_f32; +use crate::neon_math::*; #[allow(unused_imports)] use crate::TransferFunction; #[cfg(all( @@ -29,6 +34,50 @@ unsafe fn vcubeq_f32(x: float32x4_t) -> float32x4_t { vmulq_f32(vmulq_f32(x, x), x) } +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +#[inline(always)] +pub(crate) unsafe fn neon_luv_to_xyz( + l: float32x4_t, + u: float32x4_t, + v: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let zero_mask = vclezq_f32(l); + let zeros = vdupq_n_f32(0f32); + let l13 = vrecpeq_f32(vmulq_n_f32(l, 13f32)); + let u = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_U_PRIME), l13, u); + let v = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_V_PRIME), l13, v); + let l_h = vmulq_n_f32(vaddq_f32(l, vdupq_n_f32(16f32)), 1f32 / 116f32); + let y_high = vmulq_f32(vmulq_f32(l_h, l_h), l_h); + let y_low = vmulq_n_f32(l, LUV_MULTIPLIER_INVERSE_Y); + let y = vbslq_f32( + zero_mask, + zeros, + vbslq_f32(vcgtq_f32(l, vdupq_n_f32(8f32)), y_high, y_low), + ); + let zero_mask_2 = vclezq_f32(v); + let den = vrecpeq_f32(vmulq_n_f32(v, 4f32)); + let mut x = vmulq_n_f32(vmulq_f32(vmulq_f32(y, u), den), 9f32); + x = vbslq_f32(zero_mask, zeros, x); + x = vbslq_f32(zero_mask_2, zeros, x); + let mut z = vmulq_f32( + vmulq_f32( + prefer_vfmaq_f32( + prefer_vfmaq_f32(vdupq_n_f32(12f32), vdupq_n_f32(-3f32), u), + v, + vdupq_n_f32(-20f32), + ), + y, + ), + den, + ); + z = vbslq_f32(zero_mask, zeros, z); + z = vbslq_f32(zero_mask_2, zeros, z); + (x, y, z) +} + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" @@ -94,6 +143,12 @@ pub(crate) unsafe fn neon_xyz_lab_vld< g_f32 = y; b_f32 = z; } + XyzTarget::LUV => { + let (x, y, z) = neon_luv_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } @@ -246,10 +301,9 @@ pub unsafe fn neon_xyz_to_channels< let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); let dst_ptr = dst.add(dst_offset + cx * channels); - + if USE_ALPHA { - let offset_a_src_ptr = - ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); + let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); let a_low_0_f = vld1q_f32(offset_a_src_ptr); let a_row0_ = vcvtq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); diff --git a/src/neon_xyza_laba_to_image.rs b/src/neon_xyza_laba_to_image.rs index f4c08bb..665da17 100644 --- a/src/neon_xyza_laba_to_image.rs +++ b/src/neon_xyza_laba_to_image.rs @@ -32,6 +32,11 @@ use crate::TransferFunction; target_feature = "neon" ))] use std::arch::aarch64::*; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +use 
crate::neon_xyz_lab_to_image::*; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -64,6 +69,12 @@ pub(crate) unsafe fn neon_xyza_lab_vld { + let (x, y, z) = neon_luv_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } diff --git a/src/rgb_expand.rs b/src/rgb_expand.rs index 6ba0758..fe7b39e 100644 --- a/src/rgb_expand.rs +++ b/src/rgb_expand.rs @@ -31,6 +31,7 @@ pub fn rgb_to_rgba( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { + #[cfg(target_feature = "sse4.1")] if is_x86_feature_detected!("sse4.1") { _use_sse = true; } diff --git a/src/sse_to_xyz_lab.rs b/src/sse_to_xyz_lab.rs index 0a877a4..36dba83 100644 --- a/src/sse_to_xyz_lab.rs +++ b/src/sse_to_xyz_lab.rs @@ -17,6 +17,7 @@ use crate::x86_64_simd_support::*; use std::arch::x86_64::*; #[cfg(target_arch = "x86")] use std::arch::x86::*; +use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub unsafe fn get_sse_linear_transfer( @@ -59,6 +60,37 @@ unsafe fn sse_triple_to_xyz( (x, y, z) } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +pub(crate) unsafe fn sse_triple_to_luv( + x: __m128, + y: __m128, + z: __m128, +) -> (__m128, __m128, __m128) { + let zeros = _mm_setzero_ps(); + let den = _mm_prefer_fma_ps( + _mm_prefer_fma_ps(x, z, _mm_set1_ps(3f32)), + y, + _mm_set1_ps(15f32), + ); + let nan_mask = _mm_cmpeq_ps(den, _mm_set1_ps(0f32)); + let l_low_mask = _mm_cmplt_ps(y, _mm_set1_ps(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = _mm_cbrt_ps(y); + let l = _mm_select_ps( + l_low_mask, + _mm_mul_ps(y, _mm_set1_ps(LUV_MULTIPLIER_FORWARD_Y)), + _mm_prefer_fma_ps(_mm_set1_ps(-16f32), y_cbrt, _mm_set1_ps(116f32)), + ); + let u_prime = _mm_div_ps(_mm_mul_ps(x, _mm_set1_ps(4f32)), den); + let v_prime = _mm_div_ps(_mm_mul_ps(y, _mm_set1_ps(9f32)), den); + let sub_u_prime = _mm_sub_ps(u_prime, _mm_set1_ps(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = _mm_sub_ps(v_prime, _mm_set1_ps(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = _mm_mul_ps(l, _mm_set1_ps(13f32)); + let u = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_u_prime)); + let v = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_v_prime)); + (l, u, v) +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] unsafe fn sse_triple_to_lab(x: __m128, y: __m128, z: __m128) -> (__m128, __m128, __m128) { @@ -182,6 +214,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_low, y_low_low, z_low_low); @@ -206,6 +244,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_high, y_low_high, z_low_high); @@ -234,6 +278,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_low, y_high_low, z_high_low); @@ -269,6 +319,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_high_high = b; } XyzTarget::XYZ => {} + 
XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_high, y_high_high, z_high_high); diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index 13c8cb0..9c6156a 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -3,13 +3,13 @@ use std::slice; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::image_to_xyz_lab::XyzTarget; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] use crate::neon_xyz_lab_to_image::neon_xyz_to_channels; -use crate::{Lab, Xyz, XYZ_TO_SRGB_D65}; +use crate::{Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; fn xyz_to_channels<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, const TARGET: u8>( src: &[f32], @@ -95,6 +95,10 @@ fn xyz_to_channels { + LUV => { + let luv = Luv::new(l_x, l_y, l_z); + rgb = luv.to_rgb(); + } } dst_slice[x * channels + image_configuration.get_r_channel_offset()] = rgb.r; @@ -287,3 +291,67 @@ pub fn xyza_to_rgba( transfer_function, ); } + +/// This function converts LUV to RGB. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive RGB data +/// * `dst_stride` - Bytes per row for dst data +pub fn luv_to_rgb( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + let empty_vec = vec![]; + xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { LUV as u8 }>( + src, + src_stride, + &empty_vec, + 0, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts LUV to BGR. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data +/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive BGR data +/// * `dst_stride` - Bytes per row for dst data +pub fn luv_to_bgr( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + let empty_vec = vec![]; + xyz_to_channels::<{ ImageConfiguration::Bgr as u8 }, false, { LUV as u8 }>( + src, + src_stride, + &empty_vec, + 0, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +} \ No newline at end of file diff --git a/src/xyza_laba_to_image.rs b/src/xyza_laba_to_image.rs index 5181bf7..cea50a0 100644 --- a/src/xyza_laba_to_image.rs +++ b/src/xyza_laba_to_image.rs @@ -3,13 +3,13 @@ use std::slice; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::image_to_xyz_lab::XyzTarget; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; -use crate::{Lab, Xyz, XYZ_TO_SRGB_D65}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] use crate::neon_xyza_laba_to_image::neon_xyza_to_image; +use crate::{Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; fn xyz_with_alpha_to_channels<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>( src: &[f32], @@ -53,7 +53,6 @@ fn xyz_with_alpha_to_channels { + LUV => { + let luv = Luv::new(l_x, l_y, l_z); + rgb = luv.to_rgb(); + } } let l_a = unsafe { *src_slice.get_unchecked(px + 3) }; @@ -160,3 +163,65 @@ pub fn lab_with_alpha_to_bgra( TransferFunction::Srgb, ); } + +/// This function converts LUV with interleaved alpha channel to RGBA. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data with an interleaved alpha channel +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive RGBA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// The alpha channel is read from the fourth component of each pixel, +/// expected in the [0, 1] range, and rescaled to 0..=255 on output. +pub fn luv_with_alpha_to_rgba( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts LUV with interleaved alpha channel to BGRA. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data with an interleaved alpha channel +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive BGRA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// The alpha channel is read from the fourth component of each pixel, +/// expected in the [0, 1] range, and rescaled to 0..=255 on output. +pub fn luv_with_alpha_to_bgra( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +}
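
Usage sketch: a minimal round-trip through the new public L*u*v* API declared above, assuming the crate is linked as `colorutils_rs` and that `rgb_to_luv`/`luv_to_rgb` are re-exported from `src/lib.rs` as shown in this patch; the image dimensions and buffer contents are illustrative only.

use colorutils_rs::{luv_to_rgb, rgb_to_luv};

fn main() {
    let (width, height) = (2u32, 2u32);
    // Interleaved 8-bit RGB input, `width * 3` bytes per row.
    let src: Vec<u8> = vec![255, 0, 0, 0, 255, 0, 0, 0, 255, 128, 128, 128];
    let rgb_stride = width * 3;
    // L*u*v* output is three f32 components per pixel; strides are in bytes.
    let mut luv = vec![0f32; (width * height * 3) as usize];
    let luv_stride = width * 3 * std::mem::size_of::<f32>() as u32;
    rgb_to_luv(&src, rgb_stride, &mut luv, luv_stride, width, height);

    // Convert back to 8-bit RGB through the inverse path.
    let mut back = vec![0u8; (width * height * 3) as usize];
    luv_to_rgb(&luv, luv_stride, &mut back, rgb_stride, width, height);
}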
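
Test sketch: an aarch64-only sanity check for the new `vcbrtq_f32_ulp2`, written as a unit test that could sit at the bottom of `src/neon_math.rs`; the relative tolerance is an assumed bound for the check, not a measured ULP figure.

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[cfg(test)]
mod cbrt_tests {
    use super::vcbrtq_f32_ulp2;
    use std::arch::aarch64::*;

    #[test]
    fn cbrt_ulp2_tracks_scalar_cbrt() {
        for &v in &[0.001f32, 0.5, 1.0, 27.0, 1000.0] {
            // Compare lane 0 of the vector result against the scalar cube root.
            let got = unsafe { vgetq_lane_f32::<0>(vcbrtq_f32_ulp2(vdupq_n_f32(v))) };
            let want = v.cbrt();
            assert!((got - want).abs() <= want * 1e-6, "cbrt({v}): got {got}, want {want}");
        }
    }
}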