diff --git a/src/avx/to_xyz_lab.rs b/src/avx/to_xyz_lab.rs index d89348d..21b9634 100644 --- a/src/avx/to_xyz_lab.rs +++ b/src/avx/to_xyz_lab.rs @@ -5,13 +5,10 @@ use std::arch::x86_64::*; use crate::avx::gamma_curves::get_avx2_linear_transfer; use crate::avx::*; -#[allow(unused_imports)] use crate::gamma_curves::TransferFunction; -#[allow(unused_imports)] use crate::image::ImageConfiguration; -#[allow(unused_imports)] -use crate::image_to_xyz_lab::XyzTarget; use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; +use crate::xyz_target::XyzTarget; #[inline(always)] unsafe fn avx2_triple_to_xyz( @@ -200,6 +197,7 @@ pub unsafe fn avx2_image_to_xyz_lab< y_low_low = u; z_low_low = v; } + XyzTarget::LCH => {} } let write_dst_ptr = dst_ptr.add(cx * 3); @@ -233,6 +231,7 @@ pub unsafe fn avx2_image_to_xyz_lab< y_low_high = u; z_low_high = v; } + XyzTarget::LCH => {} } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_low_high, y_low_high, z_low_high); @@ -267,6 +266,7 @@ pub unsafe fn avx2_image_to_xyz_lab< y_high_low = u; z_high_low = v; } + XyzTarget::LCH => {} } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_low, y_high_low, z_high_low); @@ -308,6 +308,7 @@ pub unsafe fn avx2_image_to_xyz_lab< y_high_high = u; z_high_high = v; } + XyzTarget::LCH => {} } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_high, y_high_high, z_high_high); diff --git a/src/avx/xyz_lab_to_image.rs b/src/avx/xyz_lab_to_image.rs index 256f732..7c83cce 100644 --- a/src/avx/xyz_lab_to_image.rs +++ b/src/avx/xyz_lab_to_image.rs @@ -5,7 +5,7 @@ use crate::avx::{ avx2_interleave_rgba_epi8, avx2_pack_s32, avx2_pack_u16, }; use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget; +use crate::xyz_target::XyzTarget; use crate::TransferFunction; #[cfg(target_arch = "x86")] use std::arch::x86::*; diff --git a/src/avx/xyza_laba_to_image.rs b/src/avx/xyza_laba_to_image.rs index c18efa0..79b2038 100644 --- a/src/avx/xyza_laba_to_image.rs +++ b/src/avx/xyza_laba_to_image.rs @@ 
-10,7 +10,7 @@ use crate::avx::{ avx2_pack_u16, }; use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget; +use crate::xyz_target::XyzTarget; use crate::TransferFunction; #[inline(always)] diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index f73cf32..aa6de62 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -112,40 +112,40 @@ fn channels_to_xyz( - cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - a_channel.as_mut_ptr(), - a_offset, - &matrix, - transfer_function, - ) - } else { - cx = sse_channels_to_xyz_or_lab::( - cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - std::ptr::null_mut(), - 0usize, - &matrix, - transfer_function, - ) - } + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + if USE_ALPHA { + cx = sse_channels_to_xyz_or_lab::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + a_channel.as_mut_ptr(), + a_offset, + &matrix, + transfer_function, + ) + } else { + cx = sse_channels_to_xyz_or_lab::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + std::ptr::null_mut(), + 0usize, + &matrix, + transfer_function, + ) } } } diff --git a/src/image_xyza_laba.rs b/src/image_xyza_laba.rs index 3e634a4..60b08d7 100644 --- a/src/image_xyza_laba.rs +++ b/src/image_xyza_laba.rs @@ -56,24 +56,22 @@ fn channels_to_xyz_with_alpha( - cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - &matrix, - transfer_function, - ); - } + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + cx = sse_channels_to_xyza_laba::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + &matrix, + transfer_function, + ); } } diff --git a/src/sse/cie.rs b/src/sse/cie.rs new file mode 100644 index 0000000..1981d93 --- 
/dev/null +++ b/src/sse/cie.rs @@ -0,0 +1,183 @@ +use crate::luv::{ + LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y, LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, + LUV_WHITE_V_PRIME, +}; +use crate::sse::{ + _mm_atan2_ps, _mm_cbrt_ps, _mm_color_matrix_ps, _mm_cos_ps, _mm_cube_ps, _mm_hypot_ps, + _mm_prefer_fma_ps, _mm_select_ps, _mm_sin_ps, +}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline(always)] +pub(crate) unsafe fn sse_triple_to_xyz( + r: __m128i, + g: __m128i, + b: __m128i, + c1: __m128, + c2: __m128, + c3: __m128, + c4: __m128, + c5: __m128, + c6: __m128, + c7: __m128, + c8: __m128, + c9: __m128, + transfer: &unsafe fn(__m128) -> __m128, +) -> (__m128, __m128, __m128) { + let u8_scale = _mm_set1_ps(1f32 / 255f32); + let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); + let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); + let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); + let r_linear = transfer(r_f); + let g_linear = transfer(g_f); + let b_linear = transfer(b_f); + + let (x, y, z) = _mm_color_matrix_ps( + r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, + ); + (x, y, z) +} + +#[inline(always)] +pub(crate) unsafe fn sse_triple_to_luv( + x: __m128, + y: __m128, + z: __m128, +) -> (__m128, __m128, __m128) { + let zeros = _mm_setzero_ps(); + let den = _mm_prefer_fma_ps( + _mm_prefer_fma_ps(x, z, _mm_set1_ps(3f32)), + y, + _mm_set1_ps(15f32), + ); + let nan_mask = _mm_cmpeq_ps(den, _mm_set1_ps(0f32)); + let l_low_mask = _mm_cmplt_ps(y, _mm_set1_ps(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = _mm_cbrt_ps(y); + let l = _mm_select_ps( + l_low_mask, + _mm_mul_ps(y, _mm_set1_ps(LUV_MULTIPLIER_FORWARD_Y)), + _mm_prefer_fma_ps(_mm_set1_ps(-16f32), y_cbrt, _mm_set1_ps(116f32)), + ); + let u_prime = _mm_div_ps(_mm_mul_ps(x, _mm_set1_ps(4f32)), den); + let v_prime = _mm_div_ps(_mm_mul_ps(y, _mm_set1_ps(9f32)), den); + let sub_u_prime = _mm_sub_ps(u_prime, 
_mm_set1_ps(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = _mm_sub_ps(v_prime, _mm_set1_ps(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = _mm_mul_ps(l, _mm_set1_ps(13f32)); + let u = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_u_prime)); + let v = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_v_prime)); + (l, u, v) +} + +#[inline(always)] +pub(crate) unsafe fn sse_triple_to_lab( + x: __m128, + y: __m128, + z: __m128, +) -> (__m128, __m128, __m128) { + let x = _mm_mul_ps(x, _mm_set1_ps(100f32 / 95.047f32)); + let y = _mm_mul_ps(y, _mm_set1_ps(100f32 / 100f32)); + let z = _mm_mul_ps(z, _mm_set1_ps(100f32 / 108.883f32)); + let cbrt_x = _mm_cbrt_ps(x); + let cbrt_y = _mm_cbrt_ps(y); + let cbrt_z = _mm_cbrt_ps(z); + let s_1 = _mm_set1_ps(16.0 / 116.0); + let s_2 = _mm_set1_ps(7.787); + let lower_x = _mm_prefer_fma_ps(s_1, s_2, x); + let lower_y = _mm_prefer_fma_ps(s_1, s_2, y); + let lower_z = _mm_prefer_fma_ps(s_1, s_2, z); + let cutoff = _mm_set1_ps(0.008856f32); + let x = _mm_select_ps(_mm_cmpgt_ps(x, cutoff), cbrt_x, lower_x); + let y = _mm_select_ps(_mm_cmpgt_ps(y, cutoff), cbrt_y, lower_y); + let z = _mm_select_ps(_mm_cmpgt_ps(z, cutoff), cbrt_z, lower_z); + let l = _mm_prefer_fma_ps(_mm_set1_ps(-16.0f32), y, _mm_set1_ps(116.0f32)); + let a = _mm_mul_ps(_mm_sub_ps(x, y), _mm_set1_ps(500f32)); + let b = _mm_mul_ps(_mm_sub_ps(y, z), _mm_set1_ps(200f32)); + (l, a, b) +} + +#[inline(always)] +pub(crate) unsafe fn sse_triple_to_lch( + x: __m128, + y: __m128, + z: __m128, +) -> (__m128, __m128, __m128) { + let (luv_l, luv_u, luv_v) = sse_triple_to_luv(x, y, z); + let lch_c = _mm_hypot_ps(luv_u, luv_v); + let lch_h = _mm_atan2_ps(luv_v, luv_u); + (luv_l, lch_c, lch_h) +} + +#[inline(always)] +pub(crate) unsafe fn sse_lab_to_xyz(l: __m128, a: __m128, b: __m128) -> (__m128, __m128, __m128) { + let y = _mm_mul_ps( + _mm_add_ps(l, _mm_set1_ps(16f32)), + _mm_set1_ps(1f32 / 116f32), + ); + let x = _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(1f32 / 500f32)), y); + let 
z = _mm_sub_ps(y, _mm_mul_ps(b, _mm_set1_ps(1f32 / 200f32))); + let x3 = _mm_cube_ps(x); + let y3 = _mm_cube_ps(y); + let z3 = _mm_cube_ps(z); + let kappa = _mm_set1_ps(0.008856f32); + let k_sub = _mm_set1_ps(16f32 / 116f32); + let mult_1 = _mm_set1_ps(1f32 / 7.787f32); + let low_x = _mm_mul_ps(_mm_sub_ps(x, k_sub), mult_1); + let low_y = _mm_mul_ps(_mm_sub_ps(y, k_sub), mult_1); + let low_z = _mm_mul_ps(_mm_sub_ps(z, k_sub), mult_1); + + let x = _mm_select_ps(_mm_cmpgt_ps(x3, kappa), x3, low_x); + let y = _mm_select_ps(_mm_cmpgt_ps(y3, kappa), y3, low_y); + let z = _mm_select_ps(_mm_cmpgt_ps(z3, kappa), z3, low_z); + let x = _mm_mul_ps(x, _mm_set1_ps(95.047f32 / 100f32)); + let z = _mm_mul_ps(z, _mm_set1_ps(108.883f32 / 100f32)); + (x, y, z) +} + +#[inline(always)] +pub(crate) unsafe fn sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128, __m128, __m128) { + let zeros = _mm_setzero_ps(); + let zero_mask = _mm_cmpeq_ps(l, zeros); + let l13 = _mm_rcp_ps(_mm_mul_ps(l, _mm_set1_ps(13f32))); + let u = _mm_prefer_fma_ps(_mm_set1_ps(LUV_WHITE_U_PRIME), l13, u); + let v = _mm_prefer_fma_ps(_mm_set1_ps(LUV_WHITE_V_PRIME), l13, v); + let l_h = _mm_mul_ps( + _mm_add_ps(l, _mm_set1_ps(16f32)), + _mm_set1_ps(1f32 / 116f32), + ); + let y_high = _mm_mul_ps(_mm_mul_ps(l_h, l_h), l_h); + let y_low = _mm_mul_ps(l, _mm_set1_ps(LUV_MULTIPLIER_INVERSE_Y)); + let y = _mm_select_ps( + zero_mask, + zeros, + _mm_select_ps(_mm_cmpgt_ps(l, _mm_set1_ps(8f32)), y_high, y_low), + ); + let zero_mask_2 = _mm_cmpeq_ps(v, zeros); + let den = _mm_rcp_ps(_mm_mul_ps(v, _mm_set1_ps(4f32))); + let mut x = _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(y, u), den), _mm_set1_ps(9f32)); + x = _mm_select_ps(zero_mask, zeros, x); + x = _mm_select_ps(zero_mask_2, zeros, x); + let mut z = _mm_mul_ps( + _mm_mul_ps( + _mm_prefer_fma_ps( + _mm_prefer_fma_ps(_mm_set1_ps(12f32), _mm_set1_ps(-3f32), u), + v, + _mm_set1_ps(-20f32), + ), + y, + ), + den, + ); + z = _mm_select_ps(zero_mask, zeros, z); + z = 
_mm_select_ps(zero_mask_2, zeros, z); + (x, y, z) +} + +#[inline(always)] +pub(crate) unsafe fn sse_lch_to_xyz(l: __m128, c: __m128, h: __m128) -> (__m128, __m128, __m128) { + let u = _mm_mul_ps(c, _mm_cos_ps(h)); + let v = _mm_mul_ps(c, _mm_sin_ps(h)); + sse_luv_to_xyz(l, u, v) +} diff --git a/src/sse/color.rs b/src/sse/color.rs index aeb67bd..517f6e5 100644 --- a/src/sse/color.rs +++ b/src/sse/color.rs @@ -1,78 +1,9 @@ +use crate::sse::{_mm_abs_ps, _mm_fmod_ps, _mm_prefer_fma_ps, _mm_select_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use crate::luv::{LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME}; -use crate::sse::{_mm_abs_ps, _mm_cube_ps, _mm_fmod_ps, _mm_prefer_fma_ps, _mm_select_ps}; - -#[inline(always)] -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub(crate) unsafe fn sse_lab_to_xyz(l: __m128, a: __m128, b: __m128) -> (__m128, __m128, __m128) { - let y = _mm_mul_ps( - _mm_add_ps(l, _mm_set1_ps(16f32)), - _mm_set1_ps(1f32 / 116f32), - ); - let x = _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(1f32 / 500f32)), y); - let z = _mm_sub_ps(y, _mm_mul_ps(b, _mm_set1_ps(1f32 / 200f32))); - let x3 = _mm_cube_ps(x); - let y3 = _mm_cube_ps(y); - let z3 = _mm_cube_ps(z); - let kappa = _mm_set1_ps(0.008856f32); - let k_sub = _mm_set1_ps(16f32 / 116f32); - let mult_1 = _mm_set1_ps(1f32 / 7.787f32); - let low_x = _mm_mul_ps(_mm_sub_ps(x, k_sub), mult_1); - let low_y = _mm_mul_ps(_mm_sub_ps(y, k_sub), mult_1); - let low_z = _mm_mul_ps(_mm_sub_ps(z, k_sub), mult_1); - - let x = _mm_select_ps(_mm_cmpgt_ps(x3, kappa), x3, low_x); - let y = _mm_select_ps(_mm_cmpgt_ps(y3, kappa), y3, low_y); - let z = _mm_select_ps(_mm_cmpgt_ps(z3, kappa), z3, low_z); - let x = _mm_mul_ps(x, _mm_set1_ps(95.047f32 / 100f32)); - let z = _mm_mul_ps(z, _mm_set1_ps(108.883f32 / 100f32)); - (x, y, z) -} - -#[inline(always)] -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub(crate) unsafe fn 
sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128, __m128, __m128) { - let zeros = _mm_setzero_ps(); - let zero_mask = _mm_cmpeq_ps(l, zeros); - let l13 = _mm_rcp_ps(_mm_mul_ps(l, _mm_set1_ps(13f32))); - let u = _mm_prefer_fma_ps(_mm_set1_ps(LUV_WHITE_U_PRIME), l13, u); - let v = _mm_prefer_fma_ps(_mm_set1_ps(LUV_WHITE_V_PRIME), l13, v); - let l_h = _mm_mul_ps( - _mm_add_ps(l, _mm_set1_ps(16f32)), - _mm_set1_ps(1f32 / 116f32), - ); - let y_high = _mm_mul_ps(_mm_mul_ps(l_h, l_h), l_h); - let y_low = _mm_mul_ps(l, _mm_set1_ps(LUV_MULTIPLIER_INVERSE_Y)); - let y = _mm_select_ps( - zero_mask, - zeros, - _mm_select_ps(_mm_cmpgt_ps(l, _mm_set1_ps(8f32)), y_high, y_low), - ); - let zero_mask_2 = _mm_cmpeq_ps(v, zeros); - let den = _mm_rcp_ps(_mm_mul_ps(v, _mm_set1_ps(4f32))); - let mut x = _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(y, u), den), _mm_set1_ps(9f32)); - x = _mm_select_ps(zero_mask, zeros, x); - x = _mm_select_ps(zero_mask_2, zeros, x); - let mut z = _mm_mul_ps( - _mm_mul_ps( - _mm_prefer_fma_ps( - _mm_prefer_fma_ps(_mm_set1_ps(12f32), _mm_set1_ps(-3f32), u), - v, - _mm_set1_ps(-20f32), - ), - y, - ), - den, - ); - z = _mm_select_ps(zero_mask, zeros, z); - z = _mm_select_ps(zero_mask_2, zeros, z); - (x, y, z) -} - #[inline(always)] pub unsafe fn sse_hsl_to_rgb( h: __m128, diff --git a/src/sse/math.rs b/src/sse/math.rs index 1642010..8a1281a 100644 --- a/src/sse/math.rs +++ b/src/sse/math.rs @@ -444,7 +444,6 @@ pub unsafe fn _mm_cos_ps(d: __m128) -> __m128 { let s = _mm_mul_ps(d, d); - // TODO: Perform float masking instead d = _mm_castsi128_ps(_mm_xor_si128( _mm_and_si128( _mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(2)), _mm_set1_epi32(0)), @@ -464,3 +463,139 @@ pub unsafe fn _mm_cos_ps(d: __m128) -> __m128 { return u; } + +#[inline(always)] +pub unsafe fn _mm_hypot_ps(x: __m128, y: __m128) -> __m128 { + let xp2 = _mm_mul_ps(x, x); + let yp2 = _mm_mul_ps(y, y); + let z = _mm_add_ps(xp2, yp2); + return _mm_sqrt_ps(z); +} + +#[inline(always)] +pub unsafe fn 
_mm_poly4_ps( + x: __m128, + x2: __m128, + c3: __m128, + c2: __m128, + c1: __m128, + c0: __m128, +) -> __m128 { + _mm_fmaf_ps(x2, _mm_fmaf_ps(x, c3, c2), _mm_fmaf_ps(x, c1, c0)) +} + +#[inline(always)] +pub unsafe fn _mm_poly8q_ps( + x: __m128, + x2: __m128, + x4: __m128, + c7: __m128, + c6: __m128, + c5: __m128, + c4: __m128, + c3: __m128, + c2: __m128, + c1: __m128, + c0: __m128, +) -> __m128 { + _mm_fmaf_ps( + x4, + _mm_poly4_ps(x, x2, c7, c6, c5, c4), + _mm_poly4_ps(x, x2, c3, c2, c1, c0), + ) +} + +#[inline(always)] +unsafe fn _mm_atan2q_ps_impl(y: __m128, x: __m128) -> __m128 { + let q = _mm_select_si128( + _mm_castps_si128(_mm_cmplt_ps(x, _mm_setzero_ps())), + _mm_set1_epi32(-2), + _mm_set1_epi32(0), + ); + let x = _mm_abs_ps(x); + let is_y_more_than_x = _mm_cmpgt_ps(y, x); + let t = _mm_select_ps(is_y_more_than_x, x, _mm_setzero_ps()); + let x = _mm_select_ps(is_y_more_than_x, y, x); + let y = _mm_select_ps(is_y_more_than_x, _mm_neg_ps(t), y); + let q = _mm_select_si128( + _mm_castps_si128(is_y_more_than_x), + _mm_add_epi32(q, _mm_set1_epi32(1)), + q, + ); + let s = _mm_div_ps(y, x); + let t = _mm_mul_ps(s, s); + let t2 = _mm_mul_ps(t, t); + let t4 = _mm_mul_ps(t2, t2); + let poly = _mm_poly8q_ps( + t, + t2, + t4, + _mm_set1_ps(0.00282363896258175373077393f32), + _mm_set1_ps(-0.0159569028764963150024414f32), + _mm_set1_ps(0.0425049886107444763183594f32), + _mm_set1_ps(-0.0748900920152664184570312f32), + _mm_set1_ps(0.106347933411598205566406f32), + _mm_set1_ps(-0.142027363181114196777344f32), + _mm_set1_ps(0.199926957488059997558594f32), + _mm_set1_ps(-0.333331018686294555664062f32), + ); + let t = _mm_prefer_fma_ps(s, _mm_mul_ps(poly, t), s); + let t = _mm_prefer_fma_ps( + t, + _mm_cvtepi32_ps(q), + _mm_set1_ps(std::f32::consts::FRAC_PI_2), + ); + t +} + +#[inline(always)] +pub unsafe fn _mm_atan2_ps(y: __m128, x: __m128) -> __m128 { + let r = _mm_atan2q_ps_impl(_mm_abs_ps(y), x); + let r = _mm_mulsign_ps(r, x); + _mm_mulsign_ps(r, y) +} + 
+#[inline(always)] +pub unsafe fn _mm_sin_ps(val: __m128) -> __m128 { + let pi_v = _mm_set1_ps(std::f32::consts::PI); + let pio2_v = _mm_set1_ps(std::f32::consts::FRAC_PI_2); + let ipi_v = _mm_set1_ps(std::f32::consts::FRAC_1_PI); + + //Find positive or negative + let c_v = _mm_abs_epi32(_mm_cvtps_epi32(_mm_mul_ps(val, ipi_v))); + let sign_v = _mm_castps_si128(_mm_cmple_ps(val, _mm_setzero_ps())); + let odd_v = _mm_and_si128(c_v, _mm_set1_epi32(1)); + + let neg_v = _mm_xor_si128(odd_v, sign_v); + + //Modulus a - (n * int(a*(1/n))) + let mut ma = _mm_sub_ps(_mm_abs_ps(val), _mm_mul_ps(pi_v, _mm_cvtepi32_ps(c_v))); + let reb_v = _mm_cmpge_ps(ma, pio2_v); + + //Rebase a between 0 and pi/2 + ma = _mm_select_ps(reb_v, _mm_sub_ps(pi_v, ma), ma); + + //Taylor series + let ma2 = _mm_mul_ps(ma, ma); + + //2nd elem: x^3 / 3! + let mut elem = _mm_mul_ps(_mm_mul_ps(ma, ma2), _mm_set1_ps(0.166666666666f32)); + let mut res = _mm_sub_ps(ma, elem); + + //3rd elem: x^5 / 5! + elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.05f32)); + res = _mm_add_ps(res, elem); + + //4th elem: x^7 / 7! + elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.023809523810f32)); + res = _mm_sub_ps(res, elem); + + //5th elem: x^9 / 9! 
+ elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.013888888889f32)); + res = _mm_add_ps(res, elem); + + //Change of sign + let neg_v = _mm_slli_epi32::<31>(neg_v); + res = _mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(res), neg_v)); + return res; +} diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 5c13e7b..024129b 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -31,6 +31,7 @@ mod to_xyza_laba; mod xyz_lab_to_image; +mod cie; mod from_sigmoidal; mod sigmoidal; mod to_sigmoidal; diff --git a/src/sse/to_xyz_lab.rs b/src/sse/to_xyz_lab.rs index 72e1064..9c374ee 100644 --- a/src/sse/to_xyz_lab.rs +++ b/src/sse/to_xyz_lab.rs @@ -1,100 +1,13 @@ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget; -use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; +use crate::sse::cie::{sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; use crate::sse::*; +use crate::xyz_target::XyzTarget; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[inline(always)] -pub(crate) unsafe fn sse_triple_to_xyz( - r: __m128i, - g: __m128i, - b: __m128i, - c1: __m128, - c2: __m128, - c3: __m128, - c4: __m128, - c5: __m128, - c6: __m128, - c7: __m128, - c8: __m128, - c9: __m128, - transfer: &unsafe fn(__m128) -> __m128, -) -> (__m128, __m128, __m128) { - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); - let r_linear = transfer(r_f); - let g_linear = transfer(g_f); - let b_linear = transfer(b_f); - - let (x, y, z) = _mm_color_matrix_ps( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, - ); - (x, y, z) -} - -#[inline(always)] -pub(crate) unsafe fn sse_triple_to_luv( - x: __m128, - y: __m128, - z: __m128, -) -> (__m128, __m128, __m128) { - let zeros = 
_mm_setzero_ps(); - let den = _mm_prefer_fma_ps( - _mm_prefer_fma_ps(x, z, _mm_set1_ps(3f32)), - y, - _mm_set1_ps(15f32), - ); - let nan_mask = _mm_cmpeq_ps(den, _mm_set1_ps(0f32)); - let l_low_mask = _mm_cmplt_ps(y, _mm_set1_ps(LUV_CUTOFF_FORWARD_Y)); - let y_cbrt = _mm_cbrt_ps(y); - let l = _mm_select_ps( - l_low_mask, - _mm_mul_ps(y, _mm_set1_ps(LUV_MULTIPLIER_FORWARD_Y)), - _mm_prefer_fma_ps(_mm_set1_ps(-16f32), y_cbrt, _mm_set1_ps(116f32)), - ); - let u_prime = _mm_div_ps(_mm_mul_ps(x, _mm_set1_ps(4f32)), den); - let v_prime = _mm_div_ps(_mm_mul_ps(y, _mm_set1_ps(9f32)), den); - let sub_u_prime = _mm_sub_ps(u_prime, _mm_set1_ps(crate::luv::LUV_WHITE_U_PRIME)); - let sub_v_prime = _mm_sub_ps(v_prime, _mm_set1_ps(crate::luv::LUV_WHITE_V_PRIME)); - let l13 = _mm_mul_ps(l, _mm_set1_ps(13f32)); - let u = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_u_prime)); - let v = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_v_prime)); - (l, u, v) -} - -#[inline(always)] -pub(crate) unsafe fn sse_triple_to_lab( - x: __m128, - y: __m128, - z: __m128, -) -> (__m128, __m128, __m128) { - let x = _mm_mul_ps(x, _mm_set1_ps(100f32 / 95.047f32)); - let y = _mm_mul_ps(y, _mm_set1_ps(100f32 / 100f32)); - let z = _mm_mul_ps(z, _mm_set1_ps(100f32 / 108.883f32)); - let cbrt_x = _mm_cbrt_ps(x); - let cbrt_y = _mm_cbrt_ps(y); - let cbrt_z = _mm_cbrt_ps(z); - let s_1 = _mm_set1_ps(16.0 / 116.0); - let s_2 = _mm_set1_ps(7.787); - let lower_x = _mm_prefer_fma_ps(s_1, s_2, x); - let lower_y = _mm_prefer_fma_ps(s_1, s_2, y); - let lower_z = _mm_prefer_fma_ps(s_1, s_2, z); - let cutoff = _mm_set1_ps(0.008856f32); - let x = _mm_select_ps(_mm_cmpgt_ps(x, cutoff), cbrt_x, lower_x); - let y = _mm_select_ps(_mm_cmpgt_ps(y, cutoff), cbrt_y, lower_y); - let z = _mm_select_ps(_mm_cmpgt_ps(z, cutoff), cbrt_z, lower_z); - let l = _mm_prefer_fma_ps(_mm_set1_ps(-16.0f32), y, _mm_set1_ps(116.0f32)); - let a = _mm_mul_ps(_mm_sub_ps(x, y), _mm_set1_ps(500f32)); - let b = _mm_mul_ps(_mm_sub_ps(y, z), 
_mm_set1_ps(200f32)); - (l, a, b) -} - #[inline(always)] pub unsafe fn sse_channels_to_xyz_or_lab< const CHANNELS_CONFIGURATION: u8, @@ -199,6 +112,12 @@ pub unsafe fn sse_channels_to_xyz_or_lab< y_low_low = u; z_low_low = v; } + XyzTarget::LCH => { + let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = c; + z_low_low = h; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_low, y_low_low, z_low_low); @@ -229,6 +148,12 @@ pub unsafe fn sse_channels_to_xyz_or_lab< y_low_high = u; z_low_high = v; } + XyzTarget::LCH => { + let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = c; + z_low_high = h; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_high, y_low_high, z_low_high); @@ -263,6 +188,12 @@ pub unsafe fn sse_channels_to_xyz_or_lab< y_high_low = u; z_high_low = v; } + XyzTarget::LCH => { + let (l, c, h) = sse_triple_to_lch(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = c; + z_high_low = h; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_low, y_high_low, z_high_low); @@ -304,6 +235,12 @@ pub unsafe fn sse_channels_to_xyz_or_lab< y_high_high = u; z_high_high = v; } + XyzTarget::LCH => { + let (l, c, h) = sse_triple_to_lch(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = c; + z_high_high = h; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_high, y_high_high, z_high_high); diff --git a/src/sse/to_xyza_laba.rs b/src/sse/to_xyza_laba.rs index bbd259d..02255f7 100644 --- a/src/sse/to_xyza_laba.rs +++ b/src/sse/to_xyza_laba.rs @@ -1,18 +1,13 @@ -#[allow(unused_imports)] use crate::gamma_curves::TransferFunction; -#[allow(unused_imports)] use crate::image::ImageConfiguration; -#[allow(unused_imports)] -use crate::image_to_xyz_lab::XyzTarget; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[allow(unused_imports)] +use crate::sse::cie::{sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, 
sse_triple_to_xyz}; use crate::sse::*; +use crate::xyz_target::XyzTarget; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] pub unsafe fn sse_channels_to_xyza_laba( start_cx: usize, @@ -110,6 +105,12 @@ pub unsafe fn sse_channels_to_xyza_laba { + let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = c; + z_low_low = h; + } } let a_low = _mm_cvtepu8_epi16(a_chan); @@ -145,6 +146,12 @@ pub unsafe fn sse_channels_to_xyza_laba { + let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = c; + z_low_high = h; + } } let a_low_high = _mm_mul_ps( @@ -186,6 +193,12 @@ pub unsafe fn sse_channels_to_xyza_laba { + let (l, c, h) = sse_triple_to_lch(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = c; + z_high_low = h; + } } let a_high = _mm_unpackhi_epi8(a_chan, _mm_setzero_si128()); @@ -233,6 +246,12 @@ pub unsafe fn sse_channels_to_xyza_laba { + let (l, c, h) = sse_triple_to_lch(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = c; + z_high_high = h; + } } let a_high_high = _mm_mul_ps( diff --git a/src/sse/xyz_lab_to_image.rs b/src/sse/xyz_lab_to_image.rs index 536fdf6..7d7f4a4 100644 --- a/src/sse/xyz_lab_to_image.rs +++ b/src/sse/xyz_lab_to_image.rs @@ -1,10 +1,10 @@ use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget; -use crate::sse::color::{sse_lab_to_xyz, sse_luv_to_xyz}; +use crate::sse::cie::{sse_lab_to_xyz, sse_lch_to_xyz, sse_luv_to_xyz}; use crate::sse::{ _mm_color_matrix_ps, get_sse_gamma_transfer, sse_deinterleave_rgb_ps, sse_interleave_rgb, sse_interleave_rgba, }; +use crate::xyz_target::XyzTarget; use crate::TransferFunction; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -51,6 +51,12 @@ unsafe fn sse_xyz_lab_vld< g_f32 = y; b_f32 = z; } + XyzTarget::LCH => { + let 
(x, y, z) = sse_lch_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } diff --git a/src/sse/xyza_laba_to_image.rs b/src/sse/xyza_laba_to_image.rs index be74ffa..13c746c 100644 --- a/src/sse/xyza_laba_to_image.rs +++ b/src/sse/xyza_laba_to_image.rs @@ -1,9 +1,9 @@ use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget; -use crate::sse::color::{sse_lab_to_xyz, sse_luv_to_xyz}; +use crate::sse::cie::{sse_lab_to_xyz, sse_lch_to_xyz, sse_luv_to_xyz}; use crate::sse::{ _mm_color_matrix_ps, get_sse_gamma_transfer, sse_deinterleave_rgba_ps, sse_interleave_rgba, }; +use crate::xyz_target::XyzTarget; use crate::TransferFunction; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -47,6 +47,12 @@ unsafe fn sse_xyza_lab_vld( g_f32 = y; b_f32 = z; } + XyzTarget::LCH => { + let (x, y, z) = sse_lch_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index 9bf747c..70ebb20 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -112,40 +112,40 @@ fn xyz_to_channels( - cx, - src.as_ptr(), - src_offset, - a_channel.as_ptr(), - a_offset, - dst.as_mut_ptr(), - dst_offset, - width, - &matrix, - transfer_function, - ) - } else { - cx = sse_xyz_to_channels::( - cx, - src.as_ptr(), - src_offset, - std::ptr::null(), - 0usize, - dst.as_mut_ptr(), - dst_offset, - width, - &matrix, - transfer_function, - ) - } + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + if USE_ALPHA { + cx = sse_xyz_to_channels::( + cx, + src.as_ptr(), + src_offset, + a_channel.as_ptr(), + a_offset, + dst.as_mut_ptr(), + dst_offset, + width, + &matrix, + transfer_function, + ) + } else { + cx = sse_xyz_to_channels::( + cx, + src.as_ptr(), + src_offset, + std::ptr::null(), + 0usize, + dst.as_mut_ptr(), + dst_offset, + width, + &matrix, + transfer_function, + ) } } } diff --git 
a/src/xyz_target.rs b/src/xyz_target.rs index 63af39a..66a251c 100644 --- a/src/xyz_target.rs +++ b/src/xyz_target.rs @@ -1,5 +1,5 @@ #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub(crate) enum XyzTarget { +pub enum XyzTarget { LAB = 0, XYZ = 1, LUV = 2, diff --git a/src/xyza_laba_to_image.rs b/src/xyza_laba_to_image.rs index f0511ba..a49ec55 100644 --- a/src/xyza_laba_to_image.rs +++ b/src/xyza_laba_to_image.rs @@ -90,32 +90,15 @@ fn xyz_with_alpha_to_channels( - cx, - src.as_ptr(), - src_offset, - dst.as_mut_ptr(), - dst_offset, - width, - &matrix, - transfer_function, - ) - } - } - - #[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" - ))] - unsafe { - cx = neon_xyza_to_image::( + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + cx = sse_xyza_to_image::( cx, src.as_ptr(), src_offset, @@ -128,6 +111,23 @@ fn xyz_with_alpha_to_channels( + cx, + src.as_ptr(), + src_offset, + dst.as_mut_ptr(), + dst_offset, + width, + &matrix, + transfer_function, + ) + } + let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *mut f32 }; let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) };