diff --git a/src/app/src/main.rs b/src/app/src/main.rs
index 7314e2e..c5b2f28 100644
--- a/src/app/src/main.rs
+++ b/src/app/src/main.rs
@@ -65,7 +65,7 @@ fn main() {
         store_stride as u32,
         width,
         height,
-        TransferFunction::Srgb,
+        TransferFunction::Gamma2p8,
     );
     let elapsed_time = start_time.elapsed();
     // Print the elapsed time in milliseconds
@@ -100,7 +100,7 @@ fn main() {
         src_stride,
         width,
         height,
-        TransferFunction::Srgb
+        TransferFunction::Gamma2p8
     );
 
     let elapsed_time = start_time.elapsed();
diff --git a/src/avx/avx2_to_xyz_lab.rs b/src/avx/avx2_to_xyz_lab.rs
index acce841..13e6e83 100644
--- a/src/avx/avx2_to_xyz_lab.rs
+++ b/src/avx/avx2_to_xyz_lab.rs
@@ -1,5 +1,10 @@
-use crate::avx::avx_gamma_curves::{avx2_rec709_to_linear, avx2_srgb_to_linear};
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
 use crate::avx::*;
+use crate::avx::avx_gamma_curves::get_avx2_linear_transfer;
 #[allow(unused_imports)]
 use crate::gamma_curves::TransferFunction;
 #[allow(unused_imports)]
@@ -7,26 +12,8 @@ use crate::image::ImageConfiguration;
 #[allow(unused_imports)]
 use crate::image_to_xyz_lab::XyzTarget;
 use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y};
-#[cfg(target_arch = "x86")]
-use std::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
-
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-#[inline(always)]
-#[allow(dead_code)]
-pub unsafe fn get_avx2_linear_transfer(
-    transfer_function: TransferFunction,
-) -> unsafe fn(__m256) -> __m256 {
-    match transfer_function {
-        TransferFunction::Srgb => avx2_srgb_to_linear,
-        TransferFunction::Rec709 => avx2_rec709_to_linear,
-    }
-}
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 unsafe fn avx2_triple_to_xyz(
     r: __m256i,
     g: __m256i,
@@ -56,7 +43,6 @@ unsafe fn avx2_triple_to_xyz(
     (x, y, z)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub(crate) unsafe fn avx2_triple_to_luv(
     x: __m256,
@@ -87,9 +73,7 @@ pub(crate) unsafe fn avx2_triple_to_luv(
     (l, u, v)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 unsafe fn avx2_triple_to_lab(x: __m256, y: __m256, z: __m256) -> (__m256, __m256, __m256) {
     let x = _mm256_mul_ps(x, _mm256_set1_ps(100f32 / 95.047f32));
     let y = _mm256_mul_ps(y, _mm256_set1_ps(100f32 / 100f32));
@@ -112,10 +96,8 @@ unsafe fn avx2_triple_to_lab(x: __m256, y: __m256, z: __m256) -> (__m256, __m256
     (l, a, b)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
-pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
+pub unsafe fn avx2_image_to_xyz_lab<
     const CHANNELS_CONFIGURATION: u8,
     const USE_ALPHA: bool,
     const TARGET: u8,
diff --git a/src/avx/avx_gamma_curves.rs b/src/avx/avx_gamma_curves.rs
index 27e234b..143ac66 100644
--- a/src/avx/avx_gamma_curves.rs
+++ b/src/avx/avx_gamma_curves.rs
@@ -6,7 +6,6 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {
     let low_cut_off = _mm256_set1_ps(0.0030412825601275209f32);
@@ -26,7 +25,6 @@ pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {
     return _mm256_select_ps(mask, high, low);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {
     let low_cut_off = _mm256_set1_ps(12.92f32 * 0.0030412825601275209f32);
@@ -44,7 +42,6 @@ pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {
     return _mm256_select_ps(mask, high, low);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {
     let low_cut_off = _mm256_set1_ps(0.018053968510807f32);
@@ -64,16 +61,15 @@ pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {
     return _mm256_select_ps(mask, high, low);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 {
+pub unsafe fn avx2_rec709_to_linear(gamma: __m256) -> __m256 {
     let low_cut_off = _mm256_set1_ps(4.5f32 * 0.018053968510807f32);
-    let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off);
+    let mask = _mm256_cmp_ps::<_CMP_GE_OS>(gamma, low_cut_off);
 
-    let mut low = linear;
+    let mut low = gamma;
     let high = _mm256_pow_n_ps(
         _mm256_mul_ps(
-            _mm256_add_ps(linear, _mm256_set1_ps(0.09929682680944f32)),
+            _mm256_add_ps(gamma, _mm256_set1_ps(0.09929682680944f32)),
             _mm256_set1_ps(1f32 / 1.09929682680944f32),
         ),
         1.0f32 / 0.45f32,
@@ -82,7 +78,26 @@ pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 {
     return _mm256_select_ps(mask, high, low);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+pub unsafe fn avx2_gamma2p2_to_linear(gamma: __m256) -> __m256 {
+    _mm256_pow_n_ps(gamma, 2.2f32)
+}
+
+#[inline(always)]
+pub unsafe fn avx2_gamma2p8_to_linear(gamma: __m256) -> __m256 {
+    _mm256_pow_n_ps(gamma, 2.8f32)
+}
+
+#[inline(always)]
+pub unsafe fn avx2_gamma2p2_from_linear(linear: __m256) -> __m256 {
+    _mm256_pow_n_ps(linear, 1f32 / 2.2f32)
+}
+
+#[inline(always)]
+pub unsafe fn avx2_gamma2p8_from_linear(linear: __m256) -> __m256 {
+    _mm256_pow_n_ps(linear, 1f32 / 2.8f32)
+}
+
 #[inline(always)]
 pub unsafe fn get_avx_gamma_transfer(
     transfer_function: TransferFunction,
@@ -90,5 +105,19 @@ pub unsafe fn get_avx_gamma_transfer(
     match transfer_function {
         TransferFunction::Srgb => avx2_srgb_from_linear,
         TransferFunction::Rec709 => avx2_rec709_from_linear,
+        TransferFunction::Gamma2p2 => avx2_gamma2p2_from_linear,
+        TransferFunction::Gamma2p8 => avx2_gamma2p8_from_linear,
+    }
+}
+
+#[inline(always)]
+pub unsafe fn get_avx2_linear_transfer(
+    transfer_function: TransferFunction,
+) -> unsafe fn(__m256) -> __m256 {
+    match transfer_function {
+        TransferFunction::Srgb => avx2_srgb_to_linear,
+        TransferFunction::Rec709 => avx2_rec709_to_linear,
+        TransferFunction::Gamma2p2 => avx2_gamma2p2_to_linear,
+        TransferFunction::Gamma2p8 => avx2_gamma2p8_to_linear,
     }
 }
diff --git a/src/avx/avx_math.rs b/src/avx/avx_math.rs
index 3eae780..b659269 100644
--- a/src/avx/avx_math.rs
+++ b/src/avx/avx_math.rs
@@ -23,7 +23,6 @@ pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
     return _mm256_fmadd_ps(b, c, a);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 unsafe fn _mm256_taylorpoly_ps(
     x: __m256,
@@ -50,7 +49,6 @@ unsafe fn _mm256_taylorpoly_ps(
     return res;
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 {
     let const_ln127 = _mm256_set1_epi32(127); // 127
@@ -79,9 +77,7 @@ pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 {
     poly
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm256_select_ps(mask: __m256, true_vals: __m256, false_vals: __m256) -> __m256 {
     _mm256_blendv_ps(false_vals, true_vals, mask)
 }
@@ -107,9 +103,7 @@ pub unsafe fn _mm256_select_si256(
     )
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm256_exp_ps(x: __m256) -> __m256 {
     let c1 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
     let c2 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
@@ -271,9 +265,7 @@ pub(crate) unsafe fn _mm256_neg_epi32(x: __m256i) -> __m256i {
     return _mm256_sub_epi32(high, x);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 /// This is Cube Root using Pow functions,
 /// it also precise however due to of inexact nature of power 1/3 result slightly differ
 /// from real cbrt with about ULP 3-4, but this is almost 2 times faster than cbrt with real ULP 3.5
diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs
index 2386415..df44451 100644
--- a/src/image_to_xyz_lab.rs
+++ b/src/image_to_xyz_lab.rs
@@ -2,7 +2,7 @@
     any(target_arch = "x86_64", target_arch = "x86"),
     target_feature = "avx2"
 ))]
-use crate::avx::avx2_channels_to_xyz_or_lab;
+use crate::avx::avx2_image_to_xyz_lab;
 use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
 use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ};
@@ -103,7 +103,7 @@ fn channels_to_xyz(
-                cx = avx2_channels_to_xyz_or_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
+                cx = avx2_image_to_xyz_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
                     cx,
                     src.as_ptr(),
                     src_offset,
@@ -116,7 +116,7 @@ fn channels_to_xyz(
-                cx = avx2_channels_to_xyz_or_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
+                cx = avx2_image_to_xyz_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
                     cx,
                     src.as_ptr(),
                     src_offset,
diff --git a/src/sse/gamma_curves.rs b/src/sse/gamma_curves.rs
index 9863db0..e0e9df0 100644
--- a/src/sse/gamma_curves.rs
+++ b/src/sse/gamma_curves.rs
@@ -77,12 +77,35 @@ pub unsafe fn sse_rec709_to_linear(linear: __m128) -> __m128 {
     return _mm_select_ps(mask, high, low);
 }
 
+#[inline(always)]
+pub unsafe fn sse_gamma2p2_to_linear(gamma: __m128) -> __m128 {
+    _mm_pow_n_ps(gamma, 2.2f32)
+}
+
+#[inline(always)]
+pub unsafe fn sse_gamma2p8_to_linear(gamma: __m128) -> __m128 {
+    _mm_pow_n_ps(gamma, 2.8f32)
+}
+
+#[inline(always)]
+pub unsafe fn sse_gamma2p2_from_linear(linear: __m128) -> __m128 {
+    _mm_pow_n_ps(linear, 1f32 / 2.2f32)
+}
+
+#[inline(always)]
+pub unsafe fn sse_gamma2p8_from_linear(linear: __m128) -> __m128 {
+    _mm_pow_n_ps(linear, 1f32 / 2.8f32)
+}
+
+#[inline(always)]
 pub unsafe fn get_sse_linear_transfer(
     transfer_function: TransferFunction,
 ) -> unsafe fn(__m128) -> __m128 {
     match transfer_function {
         TransferFunction::Srgb => sse_srgb_to_linear,
         TransferFunction::Rec709 => sse_rec709_to_linear,
+        TransferFunction::Gamma2p2 => sse_gamma2p2_to_linear,
+        TransferFunction::Gamma2p8 => sse_gamma2p8_to_linear,
     }
 }
 
@@ -93,5 +116,7 @@ pub unsafe fn get_sse_gamma_transfer(
     match transfer_function {
         TransferFunction::Srgb => sse_srgb_from_linear,
         TransferFunction::Rec709 => sse_rec709_from_linear,
+        TransferFunction::Gamma2p2 => sse_gamma2p2_from_linear,
+        TransferFunction::Gamma2p8 => sse_gamma2p8_from_linear,
     }
 }
diff --git a/src/sse/math.rs b/src/sse/math.rs
index fb4605e..9866c4d 100644
--- a/src/sse/math.rs
+++ b/src/sse/math.rs
@@ -9,25 +9,19 @@ pub unsafe fn _mm_cube_ps(x: __m128) -> __m128 {
     _mm_mul_ps(_mm_mul_ps(x, x), x)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[cfg(not(target_feature = "fma"))]
 #[inline]
-#[allow(dead_code)]
 pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
     return _mm_add_ps(_mm_mul_ps(b, c), a);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[cfg(target_feature = "fma")]
 #[inline]
-#[allow(dead_code)]
 pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
     return _mm_fmadd_ps(b, c, a);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline]
-#[allow(dead_code)]
 unsafe fn _mm_taylorpoly_ps(
     x: __m128,
     poly0: __m128,
@@ -49,9 +43,7 @@ unsafe fn _mm_taylorpoly_ps(
     return res;
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_log_ps(v: __m128) -> __m128 {
     let const_ln127 = _mm_set1_epi32(127); // 127
     let const_ln2 = _mm_set1_ps(std::f32::consts::LN_2); // ln(2)
@@ -76,9 +68,7 @@ pub unsafe fn _mm_log_ps(v: __m128) -> __m128 {
     poly
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_select_ps(mask: __m128, true_vals: __m128, false_vals: __m128) -> __m128 {
     _mm_blendv_ps(false_vals, true_vals, mask)
 }
@@ -100,9 +90,7 @@ pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __
     )
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_exp_ps(x: __m128) -> __m128 {
     let c1 = _mm_castsi128_ps(_mm_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
     let c2 = _mm_castsi128_ps(_mm_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
@@ -259,9 +247,7 @@ pub(crate) unsafe fn _mm_neg_ps(x: __m128) -> __m128 {
     return _mm_sub_ps(high, x);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 /// This is Cube Root using Pow functions,
 /// it is also precise however due to of inexact nature of power 1/3 result slightly differ
 /// from real cbrt with about ULP 3-4, but this is almost 2 times faster than cbrt with real ULP 3.5