From de01f24804eb93aab2887d4f7258d460cf06a9de Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Thu, 1 Aug 2024 20:29:26 +0100 Subject: [PATCH] Improvements --- src/app/src/main.rs | 8 +- src/avx/image_to_oklab.rs | 32 +-- src/avx/linear_to_image.rs | 49 +++- src/avx/math.rs | 3 - src/avx/mod.rs | 2 + src/avx/oklab_to_image.rs | 500 ++++++++++++++++++++++++++++++++++ src/avx/routines.rs | 57 ++++ src/avx/support.rs | 7 + src/avx/to_linear.rs | 34 ++- src/avx/to_sigmoidal.rs | 26 +- src/avx/to_xyz_lab.rs | 35 ++- src/avx/xyz_lab_to_image.rs | 109 ++++++-- src/avx/xyza_laba_to_image.rs | 34 +-- src/neon/image_to_hsv.rs | 7 +- src/neon/jzazbz_to_image.rs | 4 +- src/neon/routines.rs | 25 ++ src/oklab_to_image.rs | 13 + src/sse/jzazbz_to_image.rs | 7 +- src/sse/oklab_to_image.rs | 16 +- src/sse/routines.rs | 32 +++ 20 files changed, 863 insertions(+), 137 deletions(-) create mode 100644 src/avx/oklab_to_image.rs diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 601f4de..8023532 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -66,14 +66,13 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0f32); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_oklch( + rgb_to_lab( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, - height, - TransferFunction::Srgb, + height ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -101,14 +100,13 @@ fn main() { // } let start_time = Instant::now(); - oklch_to_rgb( + lab_to_srgb( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, - TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); diff --git a/src/avx/image_to_oklab.rs b/src/avx/image_to_oklab.rs index bb0aece..ae89574 100644 --- a/src/avx/image_to_oklab.rs +++ b/src/avx/image_to_oklab.rs @@ -119,8 +119,6 @@ pub unsafe fn avx_image_to_oklab(r_low)); + let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); + let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!( r_low_high, g_low_high, b_low_high, &transfer, target, x0, x1, x2, x3, x4, x5, x6, x7, @@ -181,9 +179,9 @@ pub unsafe fn avx_image_to_oklab(r_chan)); + let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); + let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); @@ -194,7 +192,7 @@ pub unsafe fn avx_image_to_oklab(r_high)); + let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); + let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); let (x_high_high, y_high_high, z_high_high) = triple_to_oklab!( r_high_high, @@ -251,7 +249,7 @@ pub unsafe fn avx_image_to_oklab(r_low)); + let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); + let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!( r_low_high, g_low_high, b_low_high, &transfer, target, x0, x1, x2, x3, x4, x5, x6, x7, @@ -317,7 +315,7 @@ pub unsafe fn avx_image_to_oklab(a_low))), u8_scale, ); diff --git a/src/avx/linear_to_image.rs b/src/avx/linear_to_image.rs index ec066f7..01ea78e 100644 --- a/src/avx/linear_to_image.rs +++ b/src/avx/linear_to_image.rs @@ -14,7 +14,10 @@ use crate::avx::gamma_curves::get_avx_gamma_transfer; use crate::avx::routines::avx_vld_f32_and_deinterleave; use crate::avx::{avx2_interleave_rgb, avx2_interleave_rgba_epi8, avx2_pack_s32, avx2_pack_u16}; use crate::image::ImageConfiguration; -use crate::{avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_u8, TransferFunction}; +use crate::{ + avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_u8, + avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_u8, TransferFunction, +}; #[inline(always)] unsafe fn gamma_vld( @@ -113,5 +116,49 @@ pub unsafe fn avx_linear_to_gamma(src_ptr_0, transfer_function); + + let src_ptr_1 = offset_src_ptr.add(8 * channels); + + let (r_row1_, g_row1_, b_row1_, a_row1_) = + gamma_vld::(src_ptr_1, transfer_function); + + let r_row01 = avx2_pack_s32(r_row0_, r_row1_); + let g_row01 = avx2_pack_s32(g_row0_, g_row1_); + let b_row01 = avx2_pack_s32(b_row0_, b_row1_); + + let r_row = avx2_pack_u16(r_row01, zeros); + let g_row = avx2_pack_u16(g_row01, zeros); + let b_row = avx2_pack_u16(b_row01, zeros); + + let dst_ptr = dst.add(dst_offset as usize + cx * channels); + + if USE_ALPHA { + let a_row01 = avx2_pack_s32(a_row0_, a_row1_); + let a_row = avx2_pack_u16(a_row01, zeros); + avx_store_and_interleave_v4_half_u8!( + dst_ptr, + image_configuration, + r_row, + g_row, + b_row, + a_row + ); + } else { + avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); + } + + cx += 16; + } + cx } diff --git a/src/avx/math.rs b/src/avx/math.rs index cdcb66f..4c4ccee 100644 --- a/src/avx/math.rs +++ b/src/avx/math.rs @@ -18,19 +18,16 @@ pub unsafe fn _mm256_cube_ps(x: __m256) -> __m256 { } #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm256_pow_ps(x: __m256, n: __m256) -> __m256 { _mm256_pow_fast_ps(x, n) } #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm256_pow_n_ps(x: __m256, n: f32) -> __m256 { _mm256_pow_fast_ps(x, _mm256_set1_ps(n)) } #[inline(always)] -#[allow(dead_code)] pub(crate) unsafe fn _mm256_fmaf_ps(a: __m256, b: __m256, c: __m256) -> __m256 { _mm256_prefer_fma_ps(c, b, a) } diff --git a/src/avx/mod.rs b/src/avx/mod.rs index 0a8c0d0..516bf59 100644 --- a/src/avx/mod.rs +++ b/src/avx/mod.rs @@ -20,6 +20,7 @@ mod to_xyz_lab; mod utils; mod xyz_lab_to_image; mod xyza_laba_to_image; +mod oklab_to_image; pub use from_sigmoidal::avx_from_sigmoidal_row; pub use image_to_oklab::avx_image_to_oklab; @@ -32,3 +33,4 @@ pub use to_xyz_lab::*; pub use utils::*; pub use xyz_lab_to_image::*; pub use xyza_laba_to_image::*; +pub use oklab_to_image::avx_oklab_to_image; \ No newline at end of file diff --git a/src/avx/oklab_to_image.rs b/src/avx/oklab_to_image.rs new file mode 100644 index 0000000..ccbea0e --- /dev/null +++ b/src/avx/oklab_to_image.rs @@ -0,0 +1,500 @@ +/* + * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use erydanos::{_mm256_cos_ps, _mm256_sin_ps}; + +use crate::{avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, XYZ_TO_SRGB_D65}; +use crate::avx::{_mm256_color_matrix_ps, _mm256_cube_ps, avx2_pack_u16, avx2_pack_u32, avx2_interleave_rgba_epi8, avx2_interleave_rgb}; +use crate::avx::gamma_curves::get_avx_gamma_transfer; +use crate::avx::routines::avx_vld_f32_and_deinterleave_direct; +use crate::image::ImageConfiguration; +use crate::image_to_oklab::OklabTarget; + +#[inline(always)] +unsafe fn avx_oklab_vld( + src: *const f32, + transfer: &unsafe fn(__m256) -> __m256, + oklab_target: OklabTarget, + m0: __m256, + m1: __m256, + m2: __m256, + m3: __m256, + m4: __m256, + m5: __m256, + m6: __m256, + m7: __m256, + m8: __m256, + c0: __m256, + c1: __m256, + c2: __m256, + c3: __m256, + c4: __m256, + c5: __m256, + c6: __m256, + c7: __m256, + c8: __m256, + x0: __m256, + x1: __m256, + x2: __m256, + x3: __m256, + x4: __m256, + x5: __m256, + x6: __m256, + x7: __m256, + x8: __m256, +) -> (__m256i, __m256i, __m256i, __m256i) { + let v_scale_alpha = _mm256_set1_ps(255f32); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + + let (l, mut a, mut b, mut a_f32) = avx_vld_f32_and_deinterleave_direct::(src); + + if oklab_target == OklabTarget::OKLCH { + let a0 = _mm256_mul_ps(a, _mm256_cos_ps(b)); + let b0 = _mm256_mul_ps(a, _mm256_sin_ps(b)); + a = a0; + b = b0; + } + + let (mut l_l, mut l_m, mut l_s) = + _mm256_color_matrix_ps(l, a, b, m0, m1, m2, m3, m4, m5, m6, m7, m8); + + l_l = _mm256_cube_ps(l_l); + l_m = _mm256_cube_ps(l_m); + l_s = _mm256_cube_ps(l_s); + + let (x, y, z) = _mm256_color_matrix_ps(l_l, l_m, l_s, c0, c1, c2, c3, c4, c5, c6, c7, c8); + + let (r_l, g_l, b_l) = _mm256_color_matrix_ps(x, y, z, x0, x1, x2, x3, x4, x5, x6, x7, x8); + + let mut r_f32 = transfer(r_l); + let mut g_f32 = transfer(g_l); + let mut b_f32 = transfer(b_l); + + r_f32 = _mm256_mul_ps(r_f32, v_scale_alpha); + g_f32 = _mm256_mul_ps(g_f32, v_scale_alpha); + b_f32 = _mm256_mul_ps(b_f32, v_scale_alpha); + if image_configuration.has_alpha() { + a_f32 = _mm256_mul_ps(a_f32, v_scale_alpha); + } + + if image_configuration.has_alpha() { + ( + _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(r_f32)), + _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(g_f32)), + _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(b_f32)), + _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(a_f32)), + ) + } else { + ( + _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(r_f32)), + _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(g_f32)), + _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(b_f32)), + _mm256_set1_epi32(255), + ) + } +} + +#[inline(always)] +pub unsafe fn avx_oklab_to_image( + start_cx: usize, + src: *const f32, + src_offset: u32, + dst: *mut u8, + dst_offset: u32, + width: u32, + transfer_function: TransferFunction, +) -> usize { + let transfer = get_avx_gamma_transfer(transfer_function); + let target: OklabTarget = TARGET.into(); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let channels = image_configuration.get_channels_count(); + let mut cx = start_cx; + + // Matrix from XYZ + let (x0, x1, x2, x3, x4, x5, x6, x7, x8) = ( + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(0)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(1)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(2)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(0)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(1)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(2)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(0)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(1)), + _mm256_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(2)), + ); + + let (m0, m1, m2, m3, m4, m5, m6, m7, m8) = ( + _mm256_set1_ps(1f32), + _mm256_set1_ps(0.3963377774f32), + _mm256_set1_ps(0.2158037573f32), + _mm256_set1_ps(1f32), + _mm256_set1_ps(-0.1055613458f32), + _mm256_set1_ps(-0.0638541728f32), + _mm256_set1_ps(1f32), + _mm256_set1_ps(-0.0894841775f32), + _mm256_set1_ps(-1.2914855480f32), + ); + + let (c0, c1, c2, c3, c4, c5, c6, c7, c8) = ( + _mm256_set1_ps(4.0767416621f32), + _mm256_set1_ps(-3.3077115913f32), + _mm256_set1_ps(0.2309699292f32), + _mm256_set1_ps(-1.2684380046f32), + _mm256_set1_ps(2.6097574011f32), + _mm256_set1_ps(-0.3413193965f32), + _mm256_set1_ps(-0.0041960863f32), + _mm256_set1_ps(-0.7034186147f32), + _mm256_set1_ps(1.7076147010f32), + ); + + let zeros = _mm256_setzero_si256(); + + while cx + 32 < width as usize { + let offset_src_ptr = + ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); + + let src_ptr_0 = offset_src_ptr; + + let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::( + src_ptr_0, + &transfer, + target, + m0, + m1, + m2, + m3, + m4, + m5, + m6, + m7, + m8, + c0, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + ); + + let src_ptr_1 = offset_src_ptr.add(8 * channels); + + let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( + src_ptr_1, + &transfer, + target, + m0, + m1, + m2, + m3, + m4, + m5, + m6, + m7, + m8, + c0, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + ); + + let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels); + + let (r_row2_, g_row2_, b_row2_, a_row2_) = avx_oklab_vld::( + src_ptr_2, + &transfer, + target, + m0, + m1, + m2, + m3, + m4, + m5, + m6, + m7, + m8, + c0, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + ); + + let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels); + + let (r_row3_, g_row3_, b_row3_, a_row3_) = avx_oklab_vld::( + src_ptr_3, + &transfer, + target, + m0, + m1, + m2, + m3, + m4, + m5, + m6, + m7, + m8, + c0, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + ); + + let r_row01 = avx2_pack_u32(r_row0_, r_row1_); + let g_row01 = avx2_pack_u32(g_row0_, g_row1_); + let b_row01 = avx2_pack_u32(b_row0_, b_row1_); + + let r_row23 = avx2_pack_u32(r_row2_, r_row3_); + let g_row23 = avx2_pack_u32(g_row2_, g_row3_); + let b_row23 = avx2_pack_u32(b_row2_, b_row3_); + + let r_row = avx2_pack_u16(r_row01, r_row23); + let g_row = avx2_pack_u16(g_row01, g_row23); + let b_row = avx2_pack_u16(b_row01, b_row23); + + let dst_ptr = dst.add(dst_offset as usize + cx * channels); + + if image_configuration.has_alpha() { + let a_row01 = avx2_pack_u32(a_row0_, a_row1_); + let a_row23 = avx2_pack_u32(a_row2_, a_row3_); + let a_row = avx2_pack_u16(a_row01, a_row23); + avx_store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); + } else { + avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); + } + + cx += 32; + } + + while cx + 16 < width as usize { + let offset_src_ptr = + ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); + + let src_ptr_0 = offset_src_ptr; + + let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::( + src_ptr_0, + &transfer, + target, + m0, + m1, + m2, + m3, + m4, + m5, + m6, + m7, + m8, + c0, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + ); + + let src_ptr_1 = offset_src_ptr.add(8 * channels); + + let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( + src_ptr_1, + &transfer, + target, + m0, + m1, + m2, + m3, + m4, + m5, + m6, + m7, + m8, + c0, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + ); + + let r_row01 = avx2_pack_u32(r_row0_, r_row1_); + let g_row01 = avx2_pack_u32(g_row0_, g_row1_); + let b_row01 = avx2_pack_u32(b_row0_, b_row1_); + + let r_row = avx2_pack_u16(r_row01, zeros); + let g_row = avx2_pack_u16(g_row01, zeros); + let b_row = avx2_pack_u16(b_row01, zeros); + + let dst_ptr = dst.add(dst_offset as usize + cx * channels); + + if image_configuration.has_alpha() { + let a_row01 = avx2_pack_u32(a_row0_, a_row1_); + let a_row = avx2_pack_u16(a_row01, zeros); + avx_store_and_interleave_v4_half_u8!( + dst_ptr, + image_configuration, + r_row, + g_row, + b_row, + a_row + ); + } else { + avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); + } + + cx += 16; + } + + while cx + 8 < width as usize { + let offset_src_ptr = + ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); + + let src_ptr_0 = offset_src_ptr; + + let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::( + src_ptr_0, + &transfer, + target, + m0, + m1, + m2, + m3, + m4, + m5, + m6, + m7, + m8, + c0, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + ); + + let r_row01 = avx2_pack_u32(r_row0_, zeros); + let g_row01 = avx2_pack_u32(g_row0_, zeros); + let b_row01 = avx2_pack_u32(b_row0_, zeros); + + let r_row = avx2_pack_u16(r_row01, zeros); + let g_row = avx2_pack_u16(g_row01, zeros); + let b_row = avx2_pack_u16(b_row01, zeros); + + let dst_ptr = dst.add(dst_offset as usize + cx * channels); + + if image_configuration.has_alpha() { + let a_row01 = avx2_pack_u32(a_row0_, zeros); + let a_row = avx2_pack_u16(a_row01, zeros); + avx_store_and_interleave_v4_quarter_u8!( + dst_ptr, + image_configuration, + r_row, + g_row, + b_row, + a_row + ); + } else { + avx_store_and_interleave_v3_quarter_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); + } + + cx += 8; + } + + cx +} diff --git a/src/avx/routines.rs b/src/avx/routines.rs index 9c926e6..8cfc555 100644 --- a/src/avx/routines.rs +++ b/src/avx/routines.rs @@ -188,6 +188,38 @@ pub(crate) unsafe fn avx_vld_f32_and_deinterleave( + ptr: *const f32, +) -> (__m256, __m256, __m256, __m256) { + let (r_f32, g_f32, b_f32, a_f32); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + + let row0 = _mm256_loadu_ps(ptr); + let row1 = _mm256_loadu_ps(ptr.add(8)); + let row2 = _mm256_loadu_ps(ptr.add(16)); + + match image_configuration { + ImageConfiguration::Rgba | ImageConfiguration::Bgra => { + let row3 = _mm256_loadu_ps(ptr.add(24)); + let (v0, v1, v2, v3) = avx2_deinterleave_rgba_ps(row0, row1, row2, row3); + r_f32 = v0; + g_f32 = v1; + b_f32 = v2; + a_f32 = v3; + } + ImageConfiguration::Bgr | ImageConfiguration::Rgb => { + let rgb_pixels = avx2_deinterleave_rgb_ps(row0, row1, row2); + r_f32 = rgb_pixels.0; + g_f32 = rgb_pixels.1; + b_f32 = rgb_pixels.2; + a_f32 = _mm256_set1_ps(1.); + } + } + + (r_f32, g_f32, b_f32, a_f32) +} + #[macro_export] macro_rules! avx_store_and_interleave_u8 { ($ptr: expr, $configuration: expr, $j0: expr, $j1: expr, $j2: expr, $j3: expr) => {{ @@ -294,6 +326,31 @@ macro_rules! avx_store_and_interleave_v4_quarter_u8 { }}; } +#[macro_export] +macro_rules! avx_store_and_interleave_v3_quarter_u8 { + ($ptr: expr, $configuration: expr, $j0: expr, $j1: expr, $j2: expr) => {{ + let row0; + match $configuration { + ImageConfiguration::Rgba => { + (row0, _, _, _) = avx2_interleave_rgba_epi8($j0, $j1, $j2, _mm256_setzero_si256()) + } + ImageConfiguration::Rgb => { + (row0, _, _) = avx2_interleave_rgb($j0, $j1, $j2); + } + ImageConfiguration::Bgr => { + (row0, _, _) = avx2_interleave_rgb($j2, $j1, $j0); + } + ImageConfiguration::Bgra => { + (row0, _, _, _) = avx2_interleave_rgba_epi8($j2, $j1, $j0, _mm256_setzero_si256()) + } + }; + let lo = _mm256_castsi256_si128(row0); + _mm_storeu_si128($ptr as *mut __m128i, lo); + let hi = _mm256_extracti128_si256::<1>(row0); + std::ptr::copy_nonoverlapping(&hi as *const _ as *const u8, $ptr.add(16), 8); + }}; +} + #[macro_export] macro_rules! avx_store_and_interleave_v3_u8 { ($ptr: expr, $configuration: expr, $j0: expr, $j1: expr, $j2: expr) => {{ diff --git a/src/avx/support.rs b/src/avx/support.rs index cf06305..4e8b0a0 100644 --- a/src/avx/support.rs +++ b/src/avx/support.rs @@ -473,3 +473,10 @@ pub unsafe fn avx2_pack_s32(s_1: __m256i, s_2: __m256i) -> __m256i { const MASK: i32 = shuffle(3, 1, 2, 0); return _mm256_permute4x64_epi64::(packed); } + +#[inline(always)] +pub unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { + let packed = _mm256_packus_epi32(s_1, s_2); + const MASK: i32 = shuffle(3, 1, 2, 0); + return _mm256_permute4x64_epi64::(packed); +} diff --git a/src/avx/to_linear.rs b/src/avx/to_linear.rs index 9e5cbf3..a57c81f 100644 --- a/src/avx/to_linear.rs +++ b/src/avx/to_linear.rs @@ -55,8 +55,6 @@ pub unsafe fn avx_channels_to_linear(r_low)); + let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); + let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); let (x_low_high, y_low_high, z_low_high) = triple_to_linear(r_low_high, g_low_high, b_low_high, &transfer); if USE_ALPHA { let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_low, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), u8_scale, ); @@ -136,9 +134,9 @@ pub unsafe fn avx_channels_to_linear(r_chan)); + let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); + let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); @@ -147,7 +145,7 @@ pub unsafe fn avx_channels_to_linear(a_chan)); if USE_ALPHA { let a_high_low = _mm256_mul_ps( @@ -175,16 +173,16 @@ pub unsafe fn avx_channels_to_linear(r_high)); + let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); + let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); let (x_high_high, y_high_high, z_high_high) = triple_to_linear(r_high_high, g_high_high, b_high_high, &transfer); if USE_ALPHA { let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_high, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_high))), u8_scale, ); let ptr = dst_ptr.add(cx * 4 + 96); @@ -256,16 +254,16 @@ pub unsafe fn avx_channels_to_linear(r_low)); + let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); + let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); let (x_low_high, y_low_high, z_low_high) = triple_to_linear(r_low_high, g_low_high, b_low_high, &transfer); if USE_ALPHA { let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_low, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), u8_scale, ); diff --git a/src/avx/to_sigmoidal.rs b/src/avx/to_sigmoidal.rs index 95a2d99..f83d7d1 100644 --- a/src/avx/to_sigmoidal.rs +++ b/src/avx/to_sigmoidal.rs @@ -38,8 +38,6 @@ pub unsafe fn avx_image_to_sigmoidal_row< let dst_ptr = (dst as *mut u8) as *mut f32; - let zeros = _mm256_setzero_si256(); - while cx + 32 < width as usize { let src_ptr = src.add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = @@ -85,16 +83,16 @@ pub unsafe fn avx_image_to_sigmoidal_row< ); } - let r_low_high = _mm256_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm256_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm256_unpackhi_epi16(b_low, zeros); + let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); + let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); + let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); let (x_low_high, y_low_high, z_low_high) = avx_rgb_to_sigmoidal(r_low_high, g_low_high, b_low_high); if USE_ALPHA { let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_low, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), u8_scale, ); @@ -118,9 +116,9 @@ pub unsafe fn avx_image_to_sigmoidal_row< ); } - let r_high = _mm256_unpackhi_epi8(r_chan, zeros); - let g_high = _mm256_unpackhi_epi8(g_chan, zeros); - let b_high = _mm256_unpackhi_epi8(b_chan, zeros); + let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); + let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); + let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); @@ -129,7 +127,7 @@ pub unsafe fn avx_image_to_sigmoidal_row< let (x_high_low, y_high_low, z_high_low) = avx_rgb_to_sigmoidal(r_high_low, g_high_low, b_high_low); - let a_high = _mm256_unpackhi_epi8(a_chan, zeros); + let a_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a_chan)); if USE_ALPHA { let a_high_low = _mm256_mul_ps( @@ -157,16 +155,16 @@ pub unsafe fn avx_image_to_sigmoidal_row< ); } - let r_high_high = _mm256_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm256_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm256_unpackhi_epi16(b_high, zeros); + let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); + let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); + let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); let (x_high_high, y_high_high, z_high_high) = avx_rgb_to_sigmoidal(r_high_high, g_high_high, b_high_high); if USE_ALPHA { let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_high, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_high))), u8_scale, ); diff --git a/src/avx/to_xyz_lab.rs b/src/avx/to_xyz_lab.rs index c7f6d32..f23a036 100644 --- a/src/avx/to_xyz_lab.rs +++ b/src/avx/to_xyz_lab.rs @@ -62,8 +62,6 @@ pub unsafe fn avx2_image_to_xyz_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - let zeros = _mm256_setzero_si256(); - while cx + 32 < width as usize { let src_ptr = src.add(src_offset + cx * channels); let (r_chan, g_chan, b_chan, a_chan) = @@ -106,9 +104,9 @@ pub unsafe fn avx2_image_to_xyz_lab< let write_dst_ptr = dst_ptr.add(cx * 3); avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); - let r_low_high = _mm256_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm256_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm256_unpackhi_epi16(b_low, zeros); + let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); + let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); + let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); let (mut x_low_high, mut y_low_high, mut z_low_high) = avx2_triple_to_xyz( r_low_high, g_low_high, b_low_high, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, @@ -140,9 +138,9 @@ pub unsafe fn avx2_image_to_xyz_lab< let ptr2 = write_dst_ptr.add(8 * 3); avx_store_and_interleave_v3_direct_f32!(ptr2, x_low_high, y_low_high, z_low_high); - let r_high = _mm256_unpackhi_epi8(r_chan, zeros); - let g_high = _mm256_unpackhi_epi8(g_chan, zeros); - let b_high = _mm256_unpackhi_epi8(b_chan, zeros); + let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); + let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); + let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); @@ -178,9 +176,9 @@ pub unsafe fn avx2_image_to_xyz_lab< let ptr3 = write_dst_ptr.add(8 * 3 * 2); avx_store_and_interleave_v3_direct_f32!(ptr3, x_high_low, y_high_low, z_high_low); - let r_high_high = _mm256_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm256_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm256_unpackhi_epi16(b_high, zeros); + let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); + let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); + let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); let (mut x_high_high, mut y_high_high, mut z_high_high) = avx2_triple_to_xyz( r_high_high, @@ -238,13 +236,13 @@ pub unsafe fn avx2_image_to_xyz_lab< _mm256_storeu_ps(a_ptr.add(cx), a_low_low); let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_low, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), u8_scale, ); _mm256_storeu_ps(a_ptr.add(cx + 8), a_low_high); - let a_high = _mm256_unpackhi_epi8(a_chan, zeros); + let a_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a_chan)); let a_high_low = _mm256_mul_ps( _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))), @@ -254,7 +252,7 @@ pub unsafe fn avx2_image_to_xyz_lab< _mm256_storeu_ps(a_ptr.add(cx + 8 * 2), a_high_low); let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_high, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_high))), u8_scale, ); @@ -306,10 +304,9 @@ pub unsafe fn avx2_image_to_xyz_lab< let write_dst_ptr = dst_ptr.add(cx * 3); avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); - let r_low_high = _mm256_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm256_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm256_unpackhi_epi16(b_low, zeros); - + let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); + let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); + let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); let (mut x_low_high, mut y_low_high, mut z_low_high) = avx2_triple_to_xyz( r_low_high, g_low_high, b_low_high, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, &transfer, @@ -355,7 +352,7 @@ pub unsafe fn avx2_image_to_xyz_lab< _mm256_storeu_ps(a_ptr.add(cx), a_low_low); let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_low, zeros)), + _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), u8_scale, ); diff --git a/src/avx/xyz_lab_to_image.rs b/src/avx/xyz_lab_to_image.rs index 514d7eb..a048768 100644 --- a/src/avx/xyz_lab_to_image.rs +++ b/src/avx/xyz_lab_to_image.rs @@ -5,19 +5,24 @@ * // license that can be found in the LICENSE file. */ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + use crate::avx::cie::{avx_lab_to_xyz, avx_lch_to_xyz, avx_luv_to_xyz}; use crate::avx::gamma_curves::get_avx_gamma_transfer; use crate::avx::{ _mm256_color_matrix_ps, avx2_deinterleave_rgb_ps, avx2_interleave_rgb, - avx2_interleave_rgba_epi8, avx2_pack_s32, avx2_pack_u16, + avx2_interleave_rgba_epi8, avx2_pack_u16, avx2_pack_u32, }; use crate::image::ImageConfiguration; use crate::xyz_target::XyzTarget; -use crate::{avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction}; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; +use crate::{ + avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, + avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, + avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, +}; #[inline(always)] unsafe fn avx_xyz_lab_vld< @@ -130,8 +135,6 @@ pub unsafe fn avx_xyz_to_channels< let color_rescale = _mm256_set1_ps(255f32); - let zeros = _mm256_setzero_si256(); - while cx + 32 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); @@ -203,13 +206,13 @@ pub unsafe fn avx_xyz_to_channels< c9, ); - let r_row01 = avx2_pack_s32(r_row0_, r_row1_); - let g_row01 = avx2_pack_s32(g_row0_, g_row1_); - let b_row01 = avx2_pack_s32(b_row0_, b_row1_); + let r_row01 = avx2_pack_u32(r_row0_, r_row1_); + let g_row01 = avx2_pack_u32(g_row0_, g_row1_); + let b_row01 = avx2_pack_u32(b_row0_, b_row1_); - let r_row23 = avx2_pack_s32(r_row2_, r_row3_); - let g_row23 = avx2_pack_s32(g_row2_, g_row3_); - let b_row23 = avx2_pack_s32(b_row2_, b_row3_); + let r_row23 = avx2_pack_u32(r_row2_, r_row3_); + let g_row23 = avx2_pack_u32(g_row2_, g_row3_); + let b_row23 = avx2_pack_u32(b_row2_, b_row3_); let r_row = avx2_pack_u16(r_row01, r_row23); let g_row = avx2_pack_u16(g_row01, g_row23); @@ -243,8 +246,8 @@ pub unsafe fn avx_xyz_to_channels< color_rescale, ))); - let a_row01 = avx2_pack_s32(a_row0_, a_row1_); - let a_row23 = avx2_pack_s32(a_row2_, a_row3_); + let a_row01 = avx2_pack_u32(a_row0_, a_row1_); + let a_row23 = avx2_pack_u32(a_row2_, a_row3_); let a_row = avx2_pack_u16(a_row01, a_row23); avx_store_and_interleave_v4_u8!( dst_ptr, @@ -261,6 +264,8 @@ pub unsafe fn avx_xyz_to_channels< cx += 32; } + let zeros = _mm256_setzero_si256(); + while cx + 16 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); @@ -298,9 +303,9 @@ pub unsafe fn avx_xyz_to_channels< c9, ); - let r_row01 = avx2_pack_s32(r_row0_, r_row1_); - let g_row01 = avx2_pack_s32(g_row0_, g_row1_); - let b_row01 = avx2_pack_s32(b_row0_, b_row1_); + let r_row01 = avx2_pack_u32(r_row0_, r_row1_); + let g_row01 = avx2_pack_u32(g_row0_, g_row1_); + let b_row01 = avx2_pack_u32(b_row0_, b_row1_); let r_row = avx2_pack_u16(r_row01, zeros); let g_row = avx2_pack_u16(g_row01, zeros); @@ -322,9 +327,8 @@ pub unsafe fn avx_xyz_to_channels< color_rescale, ))); - let a_row01 = avx2_pack_s32(a_row0_, a_row1_); - let a_row23 = avx2_pack_s32(zeros, zeros); - let a_row = avx2_pack_u16(a_row01, a_row23); + let a_row01 = avx2_pack_u32(a_row0_, a_row1_); + let a_row = avx2_pack_u16(a_row01, zeros); avx_store_and_interleave_v4_half_u8!( dst_ptr, image_configuration, @@ -340,5 +344,66 @@ pub unsafe fn avx_xyz_to_channels< cx += 16; } + while cx + 8 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); + + let src_ptr_0 = offset_src_ptr; + + let (r_row0_, g_row0_, b_row0_) = + avx_xyz_lab_vld::( + src_ptr_0, + transfer_function, + c1, + c2, + c3, + c4, + c5, + c6, + c7, + c8, + c9, + ); + + let r_row01 = avx2_pack_u32(r_row0_, zeros); + let g_row01 = avx2_pack_u32(g_row0_, zeros); + let b_row01 = avx2_pack_u32(b_row0_, zeros); + + let r_row = avx2_pack_u16(r_row01, zeros); + let g_row = avx2_pack_u16(g_row01, zeros); + let b_row = avx2_pack_u16(b_row01, zeros); + + let dst_ptr = dst.add(dst_offset + cx * channels); + + if USE_ALPHA { + let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); + let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr); + let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( + a_low_0_f, + color_rescale, + ))); + + let a_row01 = avx2_pack_u32(a_row0_, zeros); + let a_row = avx2_pack_u16(a_row01, zeros); + avx_store_and_interleave_v4_quarter_u8!( + dst_ptr, + image_configuration, + r_row, + g_row, + b_row, + a_row + ); + } else { + avx_store_and_interleave_v3_quarter_u8!( + dst_ptr, + image_configuration, + r_row, + g_row, + b_row + ); + } + + cx += 8; + } + cx } diff --git a/src/avx/xyza_laba_to_image.rs b/src/avx/xyza_laba_to_image.rs index 9c91bb8..e58d700 100644 --- a/src/avx/xyza_laba_to_image.rs +++ b/src/avx/xyza_laba_to_image.rs @@ -14,7 +14,7 @@ use std::arch::x86_64::*; use crate::avx::cie::{avx_lab_to_xyz, avx_lch_to_xyz, avx_luv_to_xyz}; use crate::avx::gamma_curves::get_avx_gamma_transfer; use crate::avx::{ - _mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8, avx2_pack_s32, + _mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8, avx2_pack_u32, avx2_pack_u16, }; use crate::image::ImageConfiguration; @@ -193,15 +193,15 @@ pub unsafe fn avx_xyza_to_image {{ @@ -44,7 +44,7 @@ unsafe fn neon_jzazbz_gamma_vld( let transfer = get_neon_gamma_transfer(transfer_function); let v_scale_alpha = vdupq_n_f32(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave!(src, image_configuration); + let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); if target == JzazbzTarget::JZCZHZ { let cz = az; diff --git a/src/neon/routines.rs b/src/neon/routines.rs index 22305a5..a19cfab 100644 --- a/src/neon/routines.rs +++ b/src/neon/routines.rs @@ -121,3 +121,28 @@ macro_rules! load_f32_and_deinterleave { (r_f32, g_f32, b_f32, a_f32) }}; } + +#[macro_export] +macro_rules! load_f32_and_deinterleave_direct { + ($ptr: expr, $image_configuration: expr) => {{ + let d_alpha = vdupq_n_f32(1f32); + let (r_f32, g_f32, b_f32, a_f32); + match $image_configuration { + ImageConfiguration::Rgba | ImageConfiguration::Bgra => { + let rgba_pixels = vld4q_f32($ptr); + r_f32 = rgba_pixels.0; + g_f32 = rgba_pixels.1; + b_f32 = rgba_pixels.2; + a_f32 = rgba_pixels.3; + } + ImageConfiguration::Bgr | ImageConfiguration::Rgb => { + let rgb_pixels = vld3q_f32($ptr); + r_f32 = rgb_pixels.0; + g_f32 = rgb_pixels.1; + b_f32 = rgb_pixels.2; + a_f32 = d_alpha; + } + } + (r_f32, g_f32, b_f32, a_f32) + }}; +} diff --git a/src/oklab_to_image.rs b/src/oklab_to_image.rs index e2e8b83..55a93f7 100644 --- a/src/oklab_to_image.rs +++ b/src/oklab_to_image.rs @@ -4,6 +4,11 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. */ +#[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2" +))] +use crate::avx::avx_oklab_to_image; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; #[cfg(all( @@ -46,6 +51,14 @@ fn oklab_to_image( _wide_row_handle = Some(sse_oklab_to_image::); } + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2" + ))] + if is_x86_feature_detected!("avx2") { + _wide_row_handle = Some(avx_oklab_to_image::); + } + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" diff --git a/src/sse/jzazbz_to_image.rs b/src/sse/jzazbz_to_image.rs index 75cc68a..6ae81a3 100644 --- a/src/sse/jzazbz_to_image.rs +++ b/src/sse/jzazbz_to_image.rs @@ -12,16 +12,13 @@ use std::arch::x86_64::*; use erydanos::{_mm_cos_ps, _mm_isnan_ps, _mm_mlaf_ps, _mm_pow_ps, _mm_sin_ps}; +use crate::{load_f32_and_deinterleave_direct, store_and_interleave_v3_u8, store_and_interleave_v4_u8, TransferFunction, XYZ_TO_SRGB_D65}; use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; use crate::sse::{ _mm_color_matrix_ps, _mm_pow_n_ps, _mm_select_ps, get_sse_gamma_transfer, sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps, sse_interleave_rgb, sse_interleave_rgba, }; -use crate::{ - load_f32_and_deinterleave, store_and_interleave_v3_u8, store_and_interleave_v4_u8, - TransferFunction, XYZ_TO_SRGB_D65, -}; macro_rules! perceptual_quantizer_inverse { ($color: expr) => {{ @@ -54,7 +51,7 @@ unsafe fn sse_jzazbz_vld( let v_scale_alpha = _mm_set1_ps(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave!(src, image_configuration); + let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); if target == JzazbzTarget::JZCZHZ { let cz = az; diff --git a/src/sse/oklab_to_image.rs b/src/sse/oklab_to_image.rs index 01fed74..ebe4d8b 100644 --- a/src/sse/oklab_to_image.rs +++ b/src/sse/oklab_to_image.rs @@ -23,7 +23,7 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn sse_oklab_vld( src: *const f32, - transfer_function: TransferFunction, + transfer: &unsafe fn(__m128) -> __m128, oklab_target: OklabTarget, m0: __m128, m1: __m128, @@ -53,7 +53,6 @@ unsafe fn sse_oklab_vld( x7: __m128, x8: __m128, ) -> (__m128i, __m128i, __m128i, __m128i) { - let transfer = get_sse_gamma_transfer(transfer_function); let v_scale_alpha = _mm_set1_ps(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -116,6 +115,7 @@ pub unsafe fn sse_oklab_to_image usize { + let transfer = get_sse_gamma_transfer(transfer_function); let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); let channels = image_configuration.get_channels_count(); @@ -168,7 +168,7 @@ pub unsafe fn sse_oklab_to_image( src_ptr_0, - transfer_function, + &transfer, target, m0, m1, @@ -203,7 +203,7 @@ pub unsafe fn sse_oklab_to_image( src_ptr_1, - transfer_function, + &transfer, target, m0, m1, @@ -238,7 +238,7 @@ pub unsafe fn sse_oklab_to_image( src_ptr_2, - transfer_function, + &transfer, target, m0, m1, @@ -273,7 +273,7 @@ pub unsafe fn sse_oklab_to_image( src_ptr_3, - transfer_function, + &transfer, target, m0, m1, @@ -338,7 +338,7 @@ pub unsafe fn sse_oklab_to_image( src_ptr_0, - transfer_function, + &transfer, target, m0, m1, @@ -373,7 +373,7 @@ pub unsafe fn sse_oklab_to_image( src_ptr_1, - transfer_function, + &transfer, target, m0, m1, diff --git a/src/sse/routines.rs b/src/sse/routines.rs index 6229dcc..e4e0c14 100644 --- a/src/sse/routines.rs +++ b/src/sse/routines.rs @@ -130,6 +130,38 @@ macro_rules! load_f32_and_deinterleave { }}; } +#[macro_export] +macro_rules! load_f32_and_deinterleave_direct { + ($ptr: expr, $image_configuration: expr) => {{ + let (r_f32, g_f32, b_f32, a_f32); + + let row0 = _mm_loadu_ps($ptr); + let row1 = _mm_loadu_ps($ptr.add(4)); + let row2 = _mm_loadu_ps($ptr.add(8)); + + match $image_configuration { + ImageConfiguration::Rgba | ImageConfiguration::Bgra => { + let row3 = _mm_loadu_ps($ptr.add(12)); + let (v0, v1, v2, v3) = sse_deinterleave_rgba_ps(row0, row1, row2, row3); + r_f32 = v0; + g_f32 = v1; + b_f32 = v2; + a_f32 = v3; + } + ImageConfiguration::Bgr | ImageConfiguration::Rgb => { + let d_alpha = _mm_set1_ps(1f32); + let rgb_pixels = sse_deinterleave_rgb_ps(row0, row1, row2); + r_f32 = rgb_pixels.0; + g_f32 = rgb_pixels.1; + b_f32 = rgb_pixels.2; + a_f32 = d_alpha; + } + } + + (r_f32, g_f32, b_f32, a_f32) + }}; +} + #[macro_export] macro_rules! store_and_interleave_v3_direct_f32 { ($ptr: expr, $j0: expr, $j1: expr, $j2: expr) => {{