diff --git a/src/avx/mod.rs b/src/avx/mod.rs index 516bf59..8d56940 100644 --- a/src/avx/mod.rs +++ b/src/avx/mod.rs @@ -11,6 +11,7 @@ mod gamma_curves; mod image_to_oklab; mod linear_to_image; mod math; +mod oklab_to_image; mod routines; mod sigmoidal; mod support; @@ -20,12 +21,12 @@ mod to_xyz_lab; mod utils; mod xyz_lab_to_image; mod xyza_laba_to_image; -mod oklab_to_image; pub use from_sigmoidal::avx_from_sigmoidal_row; pub use image_to_oklab::avx_image_to_oklab; pub use linear_to_image::avx_linear_to_gamma; pub use math::*; +pub use oklab_to_image::avx_oklab_to_image; pub use support::*; pub use to_linear::avx_channels_to_linear; pub use to_sigmoidal::avx_image_to_sigmoidal_row; @@ -33,4 +34,3 @@ pub use to_xyz_lab::*; pub use utils::*; pub use xyz_lab_to_image::*; pub use xyza_laba_to_image::*; -pub use oklab_to_image::avx_oklab_to_image; \ No newline at end of file diff --git a/src/avx/oklab_to_image.rs b/src/avx/oklab_to_image.rs index ccbea0e..a9e62d7 100644 --- a/src/avx/oklab_to_image.rs +++ b/src/avx/oklab_to_image.rs @@ -11,12 +11,20 @@ use std::arch::x86_64::*; use erydanos::{_mm256_cos_ps, _mm256_sin_ps}; -use crate::{avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, XYZ_TO_SRGB_D65}; -use crate::avx::{_mm256_color_matrix_ps, _mm256_cube_ps, avx2_pack_u16, avx2_pack_u32, avx2_interleave_rgba_epi8, avx2_interleave_rgb}; use crate::avx::gamma_curves::get_avx_gamma_transfer; use crate::avx::routines::avx_vld_f32_and_deinterleave_direct; +use crate::avx::{ + _mm256_color_matrix_ps, _mm256_cube_ps, avx2_interleave_rgb, avx2_interleave_rgba_epi8, + avx2_pack_u16, avx2_pack_u32, +}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; +use crate::{ + avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, + avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, + avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, + XYZ_TO_SRGB_D65, +}; #[inline(always)] unsafe fn avx_oklab_vld( @@ -54,7 +62,8 @@ unsafe fn avx_oklab_vld( let v_scale_alpha = _mm256_set1_ps(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (l, mut a, mut b, mut a_f32) = avx_vld_f32_and_deinterleave_direct::(src); + let (l, mut a, mut b, mut a_f32) = + avx_vld_f32_and_deinterleave_direct::(src); if oklab_target == OklabTarget::OKLCH { let a0 = _mm256_mul_ps(a, _mm256_cos_ps(b)); @@ -164,141 +173,29 @@ pub unsafe fn avx_oklab_to_image( - src_ptr_0, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_1 = offset_src_ptr.add(8 * channels); let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( - src_ptr_1, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_1, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels); let (r_row2_, g_row2_, b_row2_, a_row2_) = avx_oklab_vld::( - src_ptr_2, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_2, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels); let (r_row3_, g_row3_, b_row3_, a_row3_) = avx_oklab_vld::( - src_ptr_3, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_3, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let r_row01 = avx2_pack_u32(r_row0_, r_row1_); @@ -319,7 +216,14 @@ pub unsafe fn avx_oklab_to_image( - src_ptr_0, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_1 = offset_src_ptr.add(8 * channels); let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( - src_ptr_1, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_1, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let r_row01 = avx2_pack_u32(r_row0_, r_row1_); @@ -436,36 +284,8 @@ pub unsafe fn avx_oklab_to_image( - src_ptr_0, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let r_row01 = avx2_pack_u32(r_row0_, zeros); @@ -490,7 +310,13 @@ pub unsafe fn avx_oklab_to_image( let transfer = get_neon_gamma_transfer(transfer_function); let v_scale_alpha = vdupq_n_f32(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); + let (jz, mut az, mut bz, mut a_f32) = + load_f32_and_deinterleave_direct!(src, image_configuration); if target == JzazbzTarget::JZCZHZ { let cz = az; diff --git a/src/neon/oklab_to_image.rs b/src/neon/oklab_to_image.rs index b687a66..dbc97df 100644 --- a/src/neon/oklab_to_image.rs +++ b/src/neon/oklab_to_image.rs @@ -8,11 +8,11 @@ use std::arch::aarch64::*; use erydanos::{vcosq_f32, vsinq_f32}; -use crate::{load_f32_and_deinterleave_direct, TransferFunction, XYZ_TO_SRGB_D65}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::neon::get_neon_gamma_transfer; use crate::neon::math::vcolorq_matrix_f32; +use crate::{load_f32_and_deinterleave_direct, TransferFunction, XYZ_TO_SRGB_D65}; #[inline(always)] unsafe fn neon_oklab_gamma_vld( diff --git a/src/sse/jzazbz_to_image.rs b/src/sse/jzazbz_to_image.rs index 6ae81a3..f4b1bc0 100644 --- a/src/sse/jzazbz_to_image.rs +++ b/src/sse/jzazbz_to_image.rs @@ -12,13 +12,16 @@ use std::arch::x86_64::*; use erydanos::{_mm_cos_ps, _mm_isnan_ps, _mm_mlaf_ps, _mm_pow_ps, _mm_sin_ps}; -use crate::{load_f32_and_deinterleave_direct, store_and_interleave_v3_u8, store_and_interleave_v4_u8, TransferFunction, XYZ_TO_SRGB_D65}; use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; use crate::sse::{ _mm_color_matrix_ps, _mm_pow_n_ps, _mm_select_ps, get_sse_gamma_transfer, sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps, sse_interleave_rgb, sse_interleave_rgba, }; +use crate::{ + load_f32_and_deinterleave_direct, store_and_interleave_v3_u8, store_and_interleave_v4_u8, + TransferFunction, XYZ_TO_SRGB_D65, +}; macro_rules! perceptual_quantizer_inverse { ($color: expr) => {{ @@ -51,7 +54,8 @@ unsafe fn sse_jzazbz_vld( let v_scale_alpha = _mm_set1_ps(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); + let (jz, mut az, mut bz, mut a_f32) = + load_f32_and_deinterleave_direct!(src, image_configuration); if target == JzazbzTarget::JZCZHZ { let cz = az; diff --git a/src/sse/oklab_to_image.rs b/src/sse/oklab_to_image.rs index ebe4d8b..0c9b867 100644 --- a/src/sse/oklab_to_image.rs +++ b/src/sse/oklab_to_image.rs @@ -167,141 +167,29 @@ pub unsafe fn sse_oklab_to_image( - src_ptr_0, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_1 = offset_src_ptr.add(4 * channels); let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_oklab_vld::( - src_ptr_1, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_1, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); let (r_row2_, g_row2_, b_row2_, a_row2_) = sse_oklab_vld::( - src_ptr_2, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_2, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); let (r_row3_, g_row3_, b_row3_, a_row3_) = sse_oklab_vld::( - src_ptr_3, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_3, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); @@ -337,71 +225,15 @@ pub unsafe fn sse_oklab_to_image( - src_ptr_0, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let src_ptr_1 = offset_src_ptr.add(4 * channels); let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_oklab_vld::( - src_ptr_1, - &transfer, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, + src_ptr_1, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, + c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); let r_row01 = _mm_packus_epi32(r_row0_, r_row1_);