From f3ee41068187c9fa19d6dcb9a870742ede148e6f Mon Sep 17 00:00:00 2001
From: awxkee
Date: Thu, 10 Oct 2024 21:36:03 +0100
Subject: [PATCH] Major rework with speed improvements

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 src/app/src/main.rs | 10 +-
 src/avx/cie.rs | 21 +-
 src/avx/image_to_oklab.rs | 392 ++------------------------------
 src/avx/linear_to_image.rs | 156 -------------
 src/avx/mod.rs | 4 -
 src/avx/oklab_to_image.rs | 312 ++-----------------------------
 src/avx/support.rs | 16 +-
 src/avx/to_linear.rs | 340 ---------------------------
 src/avx/to_xyz_lab.rs | 404 +++------------------------------
 src/avx/xyz_lab_to_image.rs | 278 ++++------------------
 src/avx/xyza_laba_to_image.rs | 217 ++++--------------
 src/image_to_linear.rs | 66 +-----
 src/image_to_linear_u8.rs | 64 +-----
 src/image_to_oklab.rs | 157 ++++++-------
 src/image_to_xyz_lab.rs | 359 ++++++++++++++++++-----------
 src/image_xyza_laba.rs | 197 ++++++++--------
 src/lab.rs | 9 +-
 src/linear_to_image.rs | 161 +++++--------
 src/linear_to_image_u8.rs | 66 +-----
 src/luv.rs | 39 +++-
 src/neon/cie.rs | 18 +-
 src/neon/image_to_oklab.rs | 320 +------------------------
 src/neon/linear_to_image.rs | 223 ------------------
 src/neon/mod.rs | 6 -
 src/neon/oklab_to_image.rs | 315 ++-----------------------------
 src/neon/to_linear.rs | 312 ------------------------
 src/neon/to_linear_u8.rs | 256 ---------------------
 src/neon/to_xyz_lab.rs | 376 +-----------------------------
 src/neon/to_xyza_laba.rs | 358 +----------------------------
 src/neon/xyz_lab_to_image.rs | 298 +-----------------------
 src/neon/xyza_laba_to_image.rs | 213 +---------------
 src/oklab.rs | 16 +-
 src/oklab_to_image.rs | 182 ++++++++-------
 src/oklch.rs | 19 +-
 src/rgb.rs | 19 ++
 src/sse/cie.rs | 34 +--
 src/sse/image_to_linear_u8.rs | 237 -------------------
 src/sse/image_to_oklab.rs | 296 ++----------------------
 src/sse/linear_to_image.rs | 167 --------------
 src/sse/mod.rs | 10 +-
 src/sse/oklab_to_image.rs | 263 ++------------------
 src/sse/to_linear.rs | 264 ---------------------
 src/sse/to_xyz_lab.rs | 327 +------------------------
 src/sse/to_xyza_laba.rs | 327 +------------------------
 src/sse/xyz_lab_to_image.rs | 272 ++--------------------
 src/sse/xyza_laba_to_image.rs | 187 ++------------
 src/xyz.rs | 2 +-
 src/xyz_lab_to_image.rs | 324 +++++++++++++++++---------
 src/xyza_laba_to_image.rs | 191 ++++++++--------
 51 files changed, 1311 insertions(+), 7793 deletions(-)
 delete mode 100644 src/avx/linear_to_image.rs
 delete mode 100644 src/avx/to_linear.rs
 delete mode 100644 src/neon/linear_to_image.rs
 delete mode 100644 src/neon/to_linear.rs
 delete mode 100644 src/neon/to_linear_u8.rs
 delete mode 100644 src/sse/image_to_linear_u8.rs
 delete mode 100644 src/sse/linear_to_image.rs
 delete mode 100644 src/sse/to_linear.rs

diff --git a/Cargo.lock b/Cargo.lock
index a9c1c58..9a7e897 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -169,7 +169,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
 
 [[package]]
 name = "colorutils-rs"
-version = "0.6.1"
+version = "0.7.0"
 dependencies = [
  "erydanos",
  "half",
diff --git a/Cargo.toml b/Cargo.toml
index b3e9b15..756da70 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }
 
 [package]
 name = "colorutils-rs"
-version = "0.6.1"
+version = "0.7.0"
 edition = "2021"
 description = "High performance utilities for color format handling and conversion."
readme = "README.md" diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 1fff3d1..0b52aab 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -33,7 +33,7 @@ fn main() { // let restored = lalphabeta.to_rgb(TransferFunction::Srgb); // println!("Restored RGB {:?}", restored); - let img = ImageReader::open("./assets/asset.jpg") + let img = ImageReader::open("./assets/op_fhd.jpg") .unwrap() .decode() .unwrap(); @@ -41,7 +41,7 @@ fn main() { println!("dimensions {:?}", img.dimensions()); println!("{:?}", img.color()); - // let img = img.to_rgba8(); + let img = img.to_rgb8(); let mut src_bytes = img.as_bytes(); let width = dimensions.0; let height = dimensions.1; @@ -68,13 +68,14 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0.); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_oklch( + rgb_to_jzazbz( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, height, + 200., TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); @@ -103,13 +104,14 @@ fn main() { // } let start_time = Instant::now(); - oklch_to_rgb( + jzazbz_to_rgb( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, + 200., TransferFunction::Srgb, ); diff --git a/src/avx/cie.rs b/src/avx/cie.rs index 0b61fa0..2977a36 100644 --- a/src/avx/cie.rs +++ b/src/avx/cie.rs @@ -6,13 +6,11 @@ */ use crate::avx::_mm256_cube_ps; -use crate::avx::gamma_curves::perform_avx2_linear_transfer; use crate::avx::math::*; use crate::luv::{ LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y, LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME, }; -use crate::TransferFunction; use erydanos::{ _mm256_atan2_ps, _mm256_cbrt_ps, _mm256_cos_ps, _mm256_hypot_ps, _mm256_prefer_fma_ps, _mm256_select_ps, _mm256_sin_ps, @@ -103,9 +101,9 @@ pub(crate) unsafe fn avx_lch_to_xyz(l: __m256, c: __m256, h: __m256) -> (__m256, #[inline(always)] pub(crate) unsafe fn avx2_triple_to_xyz( - r: __m256i, - g: __m256i, - b: __m256i, + r: __m256, + g: __m256, + b: __m256, c1: __m256, c2: __m256, c3: __m256, @@ -115,19 +113,8 @@ pub(crate) unsafe fn avx2_triple_to_xyz( c7: __m256, c8: __m256, c9: __m256, - transfer_function: TransferFunction, ) -> (__m256, __m256, __m256) { - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - let r_f = _mm256_mul_ps(_mm256_cvtepi32_ps(r), u8_scale); - let g_f = _mm256_mul_ps(_mm256_cvtepi32_ps(g), u8_scale); - let b_f = _mm256_mul_ps(_mm256_cvtepi32_ps(b), u8_scale); - let r_linear = perform_avx2_linear_transfer(transfer_function, r_f); - let g_linear = perform_avx2_linear_transfer(transfer_function, g_f); - let b_linear = perform_avx2_linear_transfer(transfer_function, b_f); - - let (x, y, z) = _mm256_color_matrix_ps( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, - ); + let (x, y, z) = _mm256_color_matrix_ps(r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9); (x, y, z) } diff --git a/src/avx/image_to_oklab.rs b/src/avx/image_to_oklab.rs index a3c59d9..3dad138 100644 --- a/src/avx/image_to_oklab.rs +++ b/src/avx/image_to_oklab.rs @@ -4,17 +4,13 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ -use crate::avx::gamma_curves::perform_avx2_linear_transfer; -use crate::avx::routines::{ - avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half, - avx_vld_u8_and_deinterleave_quarter, -}; +use crate::avx::routines::avx_vld_f32_and_deinterleave; use crate::avx::{_mm256_color_matrix_ps, avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::{ - avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32, - TransferFunction, + avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32 + , }; use erydanos::{_mm256_atan2_ps, _mm256_cbrt_fast_ps, _mm256_hypot_fast_ps}; #[cfg(target_arch = "x86")] @@ -23,22 +19,12 @@ use std::arch::x86::*; use std::arch::x86_64::*; macro_rules! triple_to_oklab { - ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr, + ($r: expr, $g: expr, $b: expr, $target: expr, $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - let r_f = _mm256_mul_ps(_mm256_cvtepi32_ps($r), u8_scale); - let g_f = _mm256_mul_ps(_mm256_cvtepi32_ps($g), u8_scale); - let b_f = _mm256_mul_ps(_mm256_cvtepi32_ps($b), u8_scale); - - let r_linear = perform_avx2_linear_transfer($transfer, r_f); - let g_linear = perform_avx2_linear_transfer($transfer, g_f); - let b_linear = perform_avx2_linear_transfer($transfer, b_f); - - let (l_l, l_m, l_s) = _mm256_color_matrix_ps( - r_linear, g_linear, b_linear, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, - ); + let (l_l, l_m, l_s) = + _mm256_color_matrix_ps($r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8); let l_ = _mm256_cbrt_fast_ps(l_l); let m_ = _mm256_cbrt_fast_ps(l_m); @@ -61,12 +47,9 @@ macro_rules! 
triple_to_oklab { #[target_feature(enable = "avx2")] pub unsafe fn avx_image_to_oklab( start_cx: usize, - src: *const u8, - src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -99,369 +82,26 @@ pub unsafe fn avx_image_to_oklab(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = triple_to_oklab!( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if image_configuration.has_alpha() { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_low, y_low_low, z_low_low, a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - if image_configuration.has_alpha() { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_high, y_low_high, z_low_high, a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_high, y_low_high, z_low_high); - } - - let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); - let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); - let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); - - let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); - let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); - let b_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_high)); - - let (x_high_low, y_high_low, z_high_low) = triple_to_oklab!( - r_high_low, - g_high_low, - b_high_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - let a_high = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - if image_configuration.has_alpha() { - let a_high_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - let ptr = 
dst_ptr.add(cx * 4 + 8 * 4 * 2); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_high_low, y_high_low, z_high_low, a_high_low - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3 * 2); - avx_store_and_interleave_v3_direct_f32!(ptr, x_high_low, y_high_low, z_high_low); - } - - let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); - let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); - let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); - - let (x_high_high, y_high_high, z_high_high) = triple_to_oklab!( - r_high_high, - g_high_high, - b_high_high, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - if image_configuration.has_alpha() { - let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4 + 8 * 4 * 3); - avx_store_and_interleave_v4_direct_f32!( - ptr, - x_high_high, - y_high_high, - z_high_high, - a_high_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_high_high, y_high_high, z_high_high); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = triple_to_oklab!( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if image_configuration.has_alpha() { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_low, y_low_low, z_low_low, a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - if image_configuration.has_alpha() { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_high, y_low_high, z_low_high, a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3); - 
avx_store_and_interleave_v3_direct_f32!(ptr, x_low_high, y_low_high, z_low_high); - } - - cx += 16; - } - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + let in_place_ptr = dst_ptr.add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_quarter::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); + avx_vld_f32_and_deinterleave::(in_place_ptr); let (x_low_low, y_low_low, z_low_low) = triple_to_oklab!( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 + r_chan, g_chan, b_chan, target, c0, c1, c2, c3, c4, c5, c6, c7, c8, m0, m1, m2, m3, m4, + m5, m6, m7, m8 ); - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - if image_configuration.has_alpha() { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4); avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_low, y_low_low, z_low_low, a_low_low + in_place_ptr, + x_low_low, + y_low_low, + z_low_low, + a_chan ); } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low); + avx_store_and_interleave_v3_direct_f32!(in_place_ptr, x_low_low, y_low_low, z_low_low); } cx += 8; diff --git a/src/avx/linear_to_image.rs b/src/avx/linear_to_image.rs deleted file mode 100644 index b4fa328..0000000 --- a/src/avx/linear_to_image.rs +++ /dev/null @@ -1,156 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -use crate::avx::gamma_curves::perform_avx_gamma_transfer; -use crate::avx::routines::avx_vld_f32_and_deinterleave; -use crate::avx::{ - _mm256_packus_four_epi32, avx2_interleave_rgb, avx2_interleave_rgba_epi8, avx2_pack_u16, - avx2_pack_u32, -}; -use crate::image::ImageConfiguration; -use crate::{ - avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_u8, - avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_u8, TransferFunction, -}; - -#[inline(always)] -unsafe fn gamma_vld( - src: *const f32, - transfer_function: TransferFunction, -) -> (__m256i, __m256i, __m256i, __m256i) { - let v_scale_alpha = _mm256_set1_ps(255f32); - let (mut r_f32, mut g_f32, mut b_f32, mut a_f32) = - avx_vld_f32_and_deinterleave::(src); - r_f32 = perform_avx_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_avx_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_avx_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm256_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm256_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm256_mul_ps(b_f32, v_scale_alpha); - if USE_ALPHA { - a_f32 = _mm256_mul_ps(a_f32, v_scale_alpha); - } - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(a_f32)), - ) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn avx_linear_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - while cx + 32 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - gamma_vld::(src_ptr_1, transfer_function); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - gamma_vld::(src_ptr_2, transfer_function); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - gamma_vld::(src_ptr_3, transfer_function); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - avx_store_and_interleave_v4_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 32; - } - - let zeros = _mm256_setzero_si256(); - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = 
offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - gamma_vld::(src_ptr_1, transfer_function); - - let r_row01 = avx2_pack_u32(r_row0_, r_row1_); - let g_row01 = avx2_pack_u32(g_row0_, g_row1_); - let b_row01 = avx2_pack_u32(b_row0_, b_row1_); - - let r_row = avx2_pack_u16(r_row01, zeros); - let g_row = avx2_pack_u16(g_row01, zeros); - let b_row = avx2_pack_u16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = avx2_pack_u32(a_row0_, a_row1_); - let a_row = avx2_pack_u16(a_row01, zeros); - avx_store_and_interleave_v4_half_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - - cx -} diff --git a/src/avx/mod.rs b/src/avx/mod.rs index 8d56940..919d88c 100644 --- a/src/avx/mod.rs +++ b/src/avx/mod.rs @@ -9,13 +9,11 @@ mod cie; mod from_sigmoidal; mod gamma_curves; mod image_to_oklab; -mod linear_to_image; mod math; mod oklab_to_image; mod routines; mod sigmoidal; mod support; -mod to_linear; mod to_sigmoidal; mod to_xyz_lab; mod utils; @@ -24,11 +22,9 @@ mod xyza_laba_to_image; pub use from_sigmoidal::avx_from_sigmoidal_row; pub use image_to_oklab::avx_image_to_oklab; -pub use linear_to_image::avx_linear_to_gamma; pub use math::*; pub use oklab_to_image::avx_oklab_to_image; pub use support::*; -pub use to_linear::avx_channels_to_linear; pub use to_sigmoidal::avx_image_to_sigmoidal_row; pub use to_xyz_lab::*; pub use utils::*; diff --git a/src/avx/oklab_to_image.rs b/src/avx/oklab_to_image.rs index b40d587..c590277 100644 --- a/src/avx/oklab_to_image.rs +++ b/src/avx/oklab_to_image.rs @@ -11,24 +11,16 @@ use std::arch::x86_64::*; use erydanos::{_mm256_cos_ps, _mm256_sin_ps}; -use crate::avx::gamma_curves::perform_avx_gamma_transfer; use crate::avx::routines::avx_vld_f32_and_deinterleave_direct; -use crate::avx::{ - _mm256_color_matrix_ps, _mm256_cube_ps, _mm256_packus_four_epi32, avx2_interleave_rgb, - avx2_interleave_rgba_epi8, -}; +use crate::avx::{_mm256_color_matrix_ps, _mm256_cube_ps}; +use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; -use crate::{ - avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, - avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, - avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, -}; +use crate::{avx_store_and_interleave_v3_f32, avx_store_and_interleave_v4_f32}; #[inline(always)] unsafe fn avx_oklab_vld( src: *const f32, - transfer_function: TransferFunction, oklab_target: OklabTarget, m0: __m256, m1: __m256, @@ -48,11 +40,8 @@ unsafe fn avx_oklab_vld( c6: __m256, c7: __m256, c8: __m256, -) -> (__m256i, __m256i, __m256i, __m256i) { - let v_scale_alpha = _mm256_set1_ps(255f32); - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - - let (l, mut a, mut b, mut a_f32) = +) -> (__m256, __m256, __m256, __m256) { + let (l, mut a, mut b, a_f32) = avx_vld_f32_and_deinterleave_direct::(src); if oklab_target == OklabTarget::Oklch { @@ -70,44 +59,17 @@ unsafe fn avx_oklab_vld( l_s = _mm256_cube_ps(l_s); let (r_l, g_l, b_l) = _mm256_color_matrix_ps(l_l, l_m, l_s, c0, c1, c2, c3, c4, c5, c6, c7, c8); - - let mut r_f32 = perform_avx_gamma_transfer(transfer_function, r_l); - let mut g_f32 = perform_avx_gamma_transfer(transfer_function, 
g_l); - let mut b_f32 = perform_avx_gamma_transfer(transfer_function, b_l); - - r_f32 = _mm256_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm256_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm256_mul_ps(b_f32, v_scale_alpha); - if image_configuration.has_alpha() { - a_f32 = _mm256_mul_ps(a_f32, v_scale_alpha); - } - - if image_configuration.has_alpha() { - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(b_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(a_f32)), - ) - } else { - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(b_f32)), - _mm256_set1_epi32(255), - ) - } + (r_l, g_l, b_l, a_f32) } #[target_feature(enable = "avx2")] pub unsafe fn avx_oklab_to_image( start_cx: usize, src: *const f32, - src_offset: u32, - dst: *mut u8, + src_offset: usize, + dst: *mut f32, dst_offset: u32, width: u32, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -138,218 +100,6 @@ pub unsafe fn avx_oklab_to_image( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = avx_oklab_vld::( - src_ptr_2, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = avx_oklab_vld::( - src_ptr_3, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - avx_store_and_interleave_v4_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let 
r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, zeros, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, zeros, zeros); - avx_store_and_interleave_v4_half_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - while cx + 8 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); @@ -357,52 +107,28 @@ pub unsafe fn avx_oklab_to_image( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, + src_ptr_0, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, c5, c6, c7, c8, ); - let r_row = _mm256_packus_four_epi32(r_row0_, zeros, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, zeros, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, zeros, zeros, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset as usize) as *mut f32).add(cx * channels); if image_configuration.has_alpha() { - let a_row = _mm256_packus_four_epi32(a_row0_, zeros, zeros, zeros); - avx_store_and_interleave_v4_quarter_u8!( + avx_store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_, + a_row0_ ); } else { - avx_store_and_interleave_v3_quarter_u8!( + avx_store_and_interleave_v3_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row + r_row0_, + g_row0_, + b_row0_ ); } diff --git a/src/avx/support.rs b/src/avx/support.rs index 8e31b4e..4e785e7 100644 --- a/src/avx/support.rs +++ b/src/avx/support.rs @@ -452,18 +452,4 @@ pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i { let multiplier = _mm256_set1_epi16(-32640); let r = _mm256_mulhi_epu16(x, multiplier); _mm256_srli_epi16::<7>(r) -} - -#[inline(always)] -pub unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i { - let packed = _mm256_packus_epi16(s_1, s_2); - const MASK: i32 = shuffle(3, 1, 2, 0); - _mm256_permute4x64_epi64::(packed) -} - -#[inline(always)] -pub unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { - let packed = _mm256_packus_epi32(s_1, s_2); - const MASK: i32 = shuffle(3, 1, 2, 0); - _mm256_permute4x64_epi64::(packed) -} +} \ No newline at end of file diff --git a/src/avx/to_linear.rs b/src/avx/to_linear.rs deleted file mode 100644 index 7b23b2d..0000000 --- a/src/avx/to_linear.rs +++ /dev/null @@ -1,340 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -use crate::avx::gamma_curves::perform_avx2_linear_transfer; -use crate::avx::routines::{ - avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half, - avx_vld_u8_and_deinterleave_quarter, -}; -use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; -use crate::gamma_curves::TransferFunction; -use crate::image::ImageConfiguration; -use crate::{avx_store_and_interleave_v3_f32, avx_store_and_interleave_v4_f32}; - -#[inline(always)] -unsafe fn triple_to_linear( - r: __m256i, - g: __m256i, - b: __m256i, - transfer_function: TransferFunction, -) -> (__m256, __m256, __m256) { - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - let r_f = _mm256_mul_ps(_mm256_cvtepi32_ps(r), u8_scale); - let g_f = _mm256_mul_ps(_mm256_cvtepi32_ps(g), u8_scale); - let b_f = _mm256_mul_ps(_mm256_cvtepi32_ps(b), u8_scale); - let r_linear = perform_avx2_linear_transfer(transfer_function, r_f); - let g_linear = perform_avx2_linear_transfer(transfer_function, g_f); - let b_linear = perform_avx2_linear_transfer(transfer_function, b_f); - (r_linear, g_linear, b_linear) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn avx_channels_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - while cx + 32 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low - ); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = - triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_f32!( - ptr, - 
image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 24); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); - let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); - let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); - - let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); - let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); - let b_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_high)); - - let (x_high_low, y_high_low, z_high_low) = - triple_to_linear(r_high_low, g_high_low, b_high_low, transfer_function); - - let a_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a_chan)); - - if USE_ALPHA { - let a_high_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 64); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low, - a_high_low - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 48); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low - ); - } - - let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); - let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); - let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); - - let (x_high_high, y_high_high, z_high_high) = - triple_to_linear(r_high_high, g_high_high, b_high_high, transfer_function); - - if USE_ALPHA { - let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_high))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4 + 96); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high, - a_high_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 24 * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high - ); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low 
- ); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = - triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 24); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_quarter::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low - ); - } - cx += 8; - } - - cx -} diff --git a/src/avx/to_xyz_lab.rs b/src/avx/to_xyz_lab.rs index c715145..7e83338 100644 --- a/src/avx/to_xyz_lab.rs +++ b/src/avx/to_xyz_lab.rs @@ -13,12 +13,13 @@ use std::arch::x86_64::*; use crate::avx::cie::{ avx2_triple_to_lab, avx2_triple_to_luv, avx2_triple_to_xyz, avx_triple_to_lch, }; -use crate::avx::routines::{avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half}; +use crate::avx::routines::avx_vld_f32_and_deinterleave; use crate::avx::*; -use crate::avx_store_and_interleave_v3_direct_f32; -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; +use crate::sse::{sse_interleave_ps_rgb, sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; use crate::xyz_target::XyzTarget; +use crate::{avx_store_and_interleave_v3_direct_f32, load_f32_and_deinterleave}; +use crate::sse::{sse_deinterleave_rgba_ps, sse_deinterleave_rgb_ps}; #[target_feature(enable = "avx2")] pub unsafe fn avx2_image_to_xyz_lab< @@ -27,7 +28,7 @@ pub unsafe fn avx2_image_to_xyz_lab< const TARGET: u8, >( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, @@ -35,7 +36,6 @@ pub unsafe fn avx2_image_to_xyz_lab< a_linearized: *mut f32, a_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { if USE_ALPHA && 
a_linearized.is_null() { panic!("Null alpha channel with requirements of linearized alpha if not supported"); @@ -57,267 +57,13 @@ pub unsafe fn avx2_image_to_xyz_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - while cx + 32 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = avx2_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let write_dst_ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = avx2_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let ptr2 = write_dst_ptr.add(8 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr2, x_low_high, y_low_high, z_low_high); - - let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); - let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); - let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); - - let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); - let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); - let b_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_high)); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = avx2_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - 
z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let ptr3 = write_dst_ptr.add(8 * 3 * 2); - avx_store_and_interleave_v3_direct_f32!(ptr3, x_high_low, y_high_low, z_high_low); - - let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); - let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); - let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = avx2_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, u, v) = avx_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - } - - let ptr4 = write_dst_ptr.add(8 * 3 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr4, x_high_high, y_high_high, z_high_high); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8), a_low_high); - - let a_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a_chan)); - - let a_high_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8 * 2), a_high_low); - - let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_high))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8 * 3), a_high_high); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 8 < width as usize { + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); + avx_vld_f32_and_deinterleave::(src_ptr); let (mut x_low_low, mut y_low_low, mut z_low_low) = avx2_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - 
cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + r_chan, g_chan, b_chan, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, ); match target { @@ -345,145 +91,69 @@ pub unsafe fn avx2_image_to_xyz_lab< let write_dst_ptr = dst_ptr.add(cx * 3); avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - let (mut x_low_high, mut y_low_high, mut z_low_high) = avx2_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let ptr2 = write_dst_ptr.add(8 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr2, x_low_high, y_low_high, z_low_high); - if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8), a_low_high); + _mm256_storeu_ps(a_ptr.add(cx), a_chan); } - cx += 16; + cx += 8; } - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 4 < width as usize { + let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = avx2_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + load_f32_and_deinterleave!(src_ptr, image_configuration); + + let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( + r_chan, + g_chan, + b_chan, + _mm256_castps256_ps128(cq1), + _mm256_castps256_ps128(cq2), + _mm256_castps256_ps128(cq3), + _mm256_castps256_ps128(cq4), + _mm256_castps256_ps128(cq5), + _mm256_castps256_ps128(cq6), + _mm256_castps256_ps128(cq7), + _mm256_castps256_ps128(cq8), + _mm256_castps256_ps128(cq9), ); match target { XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_low, y_low_low, z_low_low); + let (l, a, b) = 
sse_triple_to_lab(x_low_low, y_low_low, z_low_low); x_low_low = l; y_low_low = a; z_low_low = b; } XyzTarget::Xyz => {} XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_low, y_low_low, z_low_low); + let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); x_low_low = l; y_low_low = u; z_low_low = v; } XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_low, y_low_low, z_low_low); + let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); x_low_low = l; y_low_low = c; z_low_low = h; } } - let write_dst_ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); + let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_low, y_low_low, z_low_low); + _mm_storeu_ps(dst_ptr.add(cx * 3), v0); + _mm_storeu_ps(dst_ptr.add(cx * 3 + 4), v1); + _mm_storeu_ps(dst_ptr.add(cx * 3 + 8), v2); if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx), a_low_low); + _mm_storeu_ps(a_ptr.add(cx), a_chan); } - cx += 8; + cx += 4; } cx diff --git a/src/avx/xyz_lab_to_image.rs b/src/avx/xyz_lab_to_image.rs index 78a6844..95e3a6a 100644 --- a/src/avx/xyz_lab_to_image.rs +++ b/src/avx/xyz_lab_to_image.rs @@ -11,17 +11,15 @@ use std::arch::x86::*; use std::arch::x86_64::*; use crate::avx::cie::{avx_lab_to_xyz, avx_lch_to_xyz, avx_luv_to_xyz}; -use crate::avx::gamma_curves::perform_avx_gamma_transfer; -use crate::avx::{ - _mm256_color_matrix_ps, _mm256_packus_four_epi32, avx2_deinterleave_rgb_ps, - avx2_interleave_rgb, avx2_interleave_rgba_epi8, -}; +use crate::avx::{_mm256_color_matrix_ps, avx2_deinterleave_rgb_ps}; +use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; +use crate::sse::sse_xyz_lab_vld; +use crate::sse::{sse_interleave_ps_rgb, sse_interleave_ps_rgba}; use crate::xyz_target::XyzTarget; use crate::{ - avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, - avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, - avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, + avx_store_and_interleave_v3_f32, avx_store_and_interleave_v4_f32, store_and_interleave_v3_f32, + store_and_interleave_v4_f32, }; #[inline(always)] @@ -31,7 +29,6 @@ unsafe fn avx_xyz_lab_vld< const TARGET: u8, >( src: *const f32, - transfer_function: TransferFunction, c1: __m256, c2: __m256, c3: __m256, @@ -41,9 +38,8 @@ unsafe fn avx_xyz_lab_vld< c7: __m256, c8: __m256, c9: __m256, -) -> (__m256i, __m256i, __m256i) { +) -> (__m256, __m256, __m256) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm256_set1_ps(255f32); let lab_pixel_0 = _mm256_loadu_ps(src); let lab_pixel_1 = _mm256_loadu_ps(src.add(8)); let lab_pixel_2 = _mm256_loadu_ps(src.add(16)); @@ -75,21 +71,7 @@ unsafe fn avx_xyz_lab_vld< let (linear_r, linear_g, linear_b) = _mm256_color_matrix_ps(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = perform_avx_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_avx_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_avx_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm256_mul_ps(r_f32, v_scale_color); - 
g_f32 = _mm256_mul_ps(g_f32, v_scale_color); - b_f32 = _mm256_mul_ps(b_f32, v_scale_color); - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)), - ) + (linear_r, linear_g, linear_b) } #[target_feature(enable = "avx2")] @@ -103,11 +85,10 @@ pub unsafe fn avx_xyz_to_channels< src_offset: usize, a_channel: *const f32, a_offset: usize, - dst: *mut u8, + dst: *mut f32, dst_offset: usize, width: u32, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if USE_ALPHA && !image_configuration.has_alpha() { @@ -130,256 +111,81 @@ pub unsafe fn avx_xyz_to_channels< const CHANNELS: usize = 3usize; - let color_rescale = _mm256_set1_ps(255f32); - - while cx + 32 < width as usize { + while cx + 8 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_) = avx_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_) = - avx_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_) = - avx_xyz_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_) = - avx_xyz_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm256_loadu_ps(offset_a_src_ptr.add(8)); - let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_low_2_f = _mm256_loadu_ps(offset_a_src_ptr.add(16)); - let a_row2_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_2_f, - color_rescale, - ))); - - let a_low_3_f = _mm256_loadu_ps(offset_a_src_ptr.add(24)); - let a_row3_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_3_f, - color_rescale, - ))); + let a_row = _mm256_loadu_ps(offset_a_src_ptr); - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - avx_store_and_interleave_v4_u8!( + avx_store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, + r_row0_, + g_row0_, + b_row0_, a_row ); } else { - avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 32; - } - - let zeros = _mm256_setzero_si256(); - - while cx + 16 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const 
f32).add(cx * CHANNELS); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - avx_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_) = - avx_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, zeros, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm256_loadu_ps(offset_a_src_ptr.add(8)); - let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, zeros, zeros); - avx_store_and_interleave_v4_half_u8!( + avx_store_and_interleave_v3_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_ ); - } else { - avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); } - cx += 16; + cx += 8; } - while cx + 8 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); + while cx + 4 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_) = - avx_xyz_lab_vld::( + sse_xyz_lab_vld::( src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + _mm256_castps256_ps128(c1), + _mm256_castps256_ps128(c2), + _mm256_castps256_ps128(c3), + _mm256_castps256_ps128(c4), + _mm256_castps256_ps128(c5), + _mm256_castps256_ps128(c6), + _mm256_castps256_ps128(c7), + _mm256_castps256_ps128(c8), + _mm256_castps256_ps128(c9), ); - let r_row = _mm256_packus_four_epi32(r_row0_, zeros, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, zeros, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, zeros, zeros, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_0_f, - color_rescale, - ))); + let a_row = _mm_loadu_ps(offset_a_src_ptr); - let a_row = _mm256_packus_four_epi32(a_row0_, zeros, zeros, zeros); - avx_store_and_interleave_v4_quarter_u8!( + store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, + r_row0_, + g_row0_, + b_row0_, a_row ); } else { - avx_store_and_interleave_v3_quarter_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row - ); + store_and_interleave_v3_f32!(dst_ptr, image_configuration, r_row0_, g_row0_, b_row0_); } - cx += 8; + cx += 4; } cx diff --git a/src/avx/xyza_laba_to_image.rs b/src/avx/xyza_laba_to_image.rs index f4cdbce..e83ec89 100644 --- a/src/avx/xyza_laba_to_image.rs +++ b/src/avx/xyza_laba_to_image.rs @@ -5,26 
+5,22 @@ * // license that can be found in the LICENSE file. */ -use crate::avx::{_mm256_packus_four_epi32, avx2_interleave_rgb}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; use crate::avx::cie::{avx_lab_to_xyz, avx_lch_to_xyz, avx_luv_to_xyz}; -use crate::avx::gamma_curves::perform_avx_gamma_transfer; -use crate::avx::{_mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8}; +use crate::avx::{_mm256_color_matrix_ps, avx2_deinterleave_rgba_ps}; +use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; +use crate::avx_store_and_interleave_v4_f32; use crate::image::ImageConfiguration; +use crate::sse::{sse_interleave_ps_rgba, sse_xyza_lab_vld}; use crate::xyz_target::XyzTarget; -use crate::{ - avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_quarter_u8, - avx_store_and_interleave_v4_u8, TransferFunction, -}; #[inline(always)] unsafe fn avx_xyza_lab_vld( src: *const f32, - transfer_function: TransferFunction, c1: __m256, c2: __m256, c3: __m256, @@ -34,9 +30,8 @@ unsafe fn avx_xyza_lab_vld( c7: __m256, c8: __m256, c9: __m256, -) -> (__m256i, __m256i, __m256i, __m256i) { +) -> (__m256, __m256, __m256, __m256) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm256_set1_ps(255f32); let pixel_0 = _mm256_loadu_ps(src); let pixel_1 = _mm256_loadu_ps(src.add(8)); let pixel_2 = _mm256_loadu_ps(src.add(16)); @@ -69,23 +64,7 @@ unsafe fn avx_xyza_lab_vld( let (linear_r, linear_g, linear_b) = _mm256_color_matrix_ps(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = perform_avx_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_avx_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_avx_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm256_mul_ps(r_f32, v_scale_color); - g_f32 = _mm256_mul_ps(g_f32, v_scale_color); - b_f32 = _mm256_mul_ps(b_f32, v_scale_color); - let a_f32 = _mm256_mul_ps(a_f32, v_scale_color); - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(a_f32)), - ) + (linear_r, linear_g, linear_b, a_f32) } #[target_feature(enable = "sse4.1")] @@ -93,11 +72,10 @@ pub unsafe fn avx_xyza_to_image usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if !image_configuration.has_alpha() { @@ -120,177 +98,64 @@ pub unsafe fn avx_xyza_to_image( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = avx_xyza_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = avx_xyza_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - let 
a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - avx_store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); - - cx += 32; - } - - while cx + 16 < width as usize { + while cx + 8 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_xyza_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, zeros, zeros); - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, zeros, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); - avx_store_and_interleave_v4_half_u8!( + avx_store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_, + a_row0_ ); - cx += 16; + cx += 8; } - while cx + 8 < width as usize { + while cx + 4 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); let src_ptr_0 = offset_src_ptr; - let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_xyza_lab_vld::( + let (r_row0_, g_row0_, b_row0_, a_row0_) = sse_xyza_lab_vld::( src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + _mm256_castps256_ps128(c1), + _mm256_castps256_ps128(c2), + _mm256_castps256_ps128(c3), + _mm256_castps256_ps128(c4), + _mm256_castps256_ps128(c5), + _mm256_castps256_ps128(c6), + _mm256_castps256_ps128(c7), + _mm256_castps256_ps128(c8), + _mm256_castps256_ps128(c9), ); - let r_row = _mm256_packus_four_epi32(r_row0_, zeros, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, zeros, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, zeros, zeros, zeros); - let a_row = _mm256_packus_four_epi32(a_row0_, zeros, zeros, zeros); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); - let dst_ptr = dst.add(dst_offset + cx * channels); + let (rgba0, rgba1, rgba2, rgba3) = match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Rgba => { + sse_interleave_ps_rgba(r_row0_, g_row0_, b_row0_, a_row0_) + } + ImageConfiguration::Bgra | ImageConfiguration::Bgr => { + sse_interleave_ps_rgba(b_row0_, g_row0_, r_row0_, a_row0_) + } + }; - avx_store_and_interleave_v4_quarter_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); + _mm_storeu_ps(dst_ptr, rgba0); + _mm_storeu_ps(dst_ptr.add(4), rgba1); + _mm_storeu_ps(dst_ptr.add(8), rgba2); + _mm_storeu_ps(dst_ptr.add(12), rgba3); - cx += 8; + cx += 4; } cx diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs index 62d19ec..0e9c7bc 100644 --- a/src/image_to_linear.rs +++ b/src/image_to_linear.rs @@ -4,14 +4,8 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::avx::avx_channels_to_linear; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::neon_channels_to_linear; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::*; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -37,23 +31,9 @@ fn channels_to_linear( let channels = image_configuration.get_channels_count(); - let mut _wide_row_handle: Option< - unsafe fn(usize, *const u8, usize, u32, *mut f32, usize, TransferFunction) -> usize, - > = None; - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handle = Some(sse_channels_to_linear::); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("avx2") { - _wide_row_handle = Some(avx_channels_to_linear::); - } - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handle = Some(neon_channels_to_linear::); + let mut lut_table = vec![0f32; 256]; + for i in 0..256 { + lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0)); } #[cfg(not(feature = "rayon"))] @@ -63,20 +43,6 @@ fn channels_to_linear( for _ in 0..height as usize { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handle { - unsafe { - _cx = dispatcher( - _cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - transfer_function, - ) - } - } - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; @@ -98,14 +64,13 @@ fn channels_to_linear( }; let rgb = Rgb::::new(r, g, b); - let rgb_f32 = rgb.to_rgb_f32(); unsafe { - dst.write_unaligned(transfer_function.linearize(rgb_f32.r)); + dst.write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); dst.add(1) - .write_unaligned(transfer_function.linearize(rgb_f32.g)); + .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); dst.add(2) - .write_unaligned(transfer_function.linearize(rgb_f32.b)); + .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); } if USE_ALPHA && image_configuration.has_alpha() { @@ -139,18 +104,6 @@ fn channels_to_linear( .for_each(|(dst_row, src_row)| unsafe { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - src_row.as_ptr(), - 0, - width, - dst_row.as_mut_ptr() as *mut f32, - 0, - transfer_function, - ) - } - let src_ptr = src_row.as_ptr(); let dst_ptr = dst_row.as_mut_ptr() as *mut f32; @@ -169,13 +122,12 @@ fn channels_to_linear( .read_unaligned(); let rgb = Rgb::::new(r, g, b); - let rgb_f32 = rgb.to_rgb_f32(); - dst.write_unaligned(transfer_function.linearize(rgb_f32.r)); + dst.write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); dst.add(1) - .write_unaligned(transfer_function.linearize(rgb_f32.g)); + .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); dst.add(2) - .write_unaligned(transfer_function.linearize(rgb_f32.b)); + .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); if USE_ALPHA && image_configuration.has_alpha() { let a = src diff --git a/src/image_to_linear_u8.rs b/src/image_to_linear_u8.rs index 19e3502..84b41ee 100644 --- a/src/image_to_linear_u8.rs +++ b/src/image_to_linear_u8.rs @@ -6,10 +6,6 @@ */ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; 
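The image_to_linear.rs hunk above replaces the per-row SIMD transfer kernels with a 256-entry lookup table: a u8 channel takes only 256 values, so the transfer function is evaluated once per value instead of once per pixel. A minimal self-contained sketch of the idea, with `srgb_linearize` standing in for `TransferFunction::linearize` (not the crate's actual API):

```rust
/// Stand-in for `TransferFunction::linearize` (sRGB EOTF), illustration only.
fn srgb_linearize(v: f32) -> f32 {
    if v <= 0.04045 {
        v / 12.92
    } else {
        ((v + 0.055) / 1.055).powf(2.4)
    }
}

/// Built once per image; a table load then replaces per-pixel `powf` math.
fn build_linear_lut() -> [f32; 256] {
    let mut lut = [0f32; 256];
    for (i, slot) in lut.iter_mut().enumerate() {
        *slot = srgb_linearize(i as f32 * (1.0 / 255.0));
    }
    lut
}

fn main() {
    let lut = build_linear_lut();
    let px = [12u8, 128, 250];
    // One indexed load per channel instead of a branch and a powf per channel.
    let linear: Vec<f32> = px.iter().map(|&c| lut[c as usize]).collect();
    println!("{linear:?}");
}
```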
-#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::*; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_image_to_linear_unsigned::sse_channels_to_linear_u8; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -33,20 +29,11 @@ fn channels_to_linear( let channels = image_configuration.get_channels_count(); - let mut _wide_row_handler: Option< - unsafe fn(usize, *const u8, usize, u32, *mut u8, usize, TransferFunction) -> usize, - > = None; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handler = - Some(neon_channels_to_linear_u8::); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handler = - Some(sse_channels_to_linear_u8::); + let mut lut_table = vec![0u8; 256]; + for i in 0..256 { + lut_table[i] = (transfer_function.linearize(i as f32 * (1. / 255.0)) * 255.) + .ceil() + .min(255.) as u8; } #[cfg(not(feature = "rayon"))] @@ -56,20 +43,6 @@ fn channels_to_linear( { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handler { - unsafe { - _cx = dispatcher( - _cx, - src_row.as_ptr(), - 0, - width, - dst_row.as_mut_ptr(), - 0, - transfer_function, - ) - } - } - for x in _cx..width as usize { let px = x * channels; let r = @@ -85,9 +58,9 @@ fn channels_to_linear( let rgb = rgb_f32.to_u8(); unsafe { - *dst_row.get_unchecked_mut(px) = rgb.r; - *dst_row.get_unchecked_mut(px + 1) = rgb.g; - *dst_row.get_unchecked_mut(px + 2) = rgb.b; + *dst_row.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); + *dst_row.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); + *dst_row.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); } if USE_ALPHA && image_configuration.has_alpha() { @@ -109,18 +82,6 @@ fn channels_to_linear( .for_each(|(dst_row, src_row)| unsafe { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher( - _cx, - src_row.as_ptr(), - 0, - width, - dst_row.as_mut_ptr(), - 0, - transfer_function, - ) - } - for x in _cx..width as usize { let px = x * channels; let r = *src_row.get_unchecked(px + image_configuration.get_r_channel_offset()); @@ -128,13 +89,10 @@ fn channels_to_linear( let b = *src_row.get_unchecked(px + image_configuration.get_b_channel_offset()); let rgb = Rgb::::new(r, g, b); - let mut rgb_f32 = rgb.to_rgb_f32(); - rgb_f32 = rgb_f32.linearize(transfer_function); - let rgb = rgb_f32.to_u8(); - *dst_row.get_unchecked_mut(px) = rgb.r; - *dst_row.get_unchecked_mut(px + 1) = rgb.g; - *dst_row.get_unchecked_mut(px + 2) = rgb.b; + *dst_row.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); + *dst_row.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); + *dst_row.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); if USE_ALPHA && image_configuration.has_alpha() { let a = diff --git a/src/image_to_oklab.rs b/src/image_to_oklab.rs index 25ceb67..90197b6 100644 --- a/src/image_to_oklab.rs +++ b/src/image_to_oklab.rs @@ -12,12 +12,13 @@ use crate::neon::neon_image_to_oklab; use crate::oklch::Oklch; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::sse_image_to_oklab; -use crate::{Oklab, Rgb, TransferFunction}; +use crate::{ + bgr_to_linear, bgra_to_linear, rgb_to_linear, rgba_to_linear, Oklab, Rgb, TransferFunction, +}; #[cfg(feature = "rayon")] -use rayon::iter::{IndexedParallelIterator, ParallelIterator}; 
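The image_to_linear_u8.rs hunk above applies the same trick end to end in u8: linearization and re-quantization collapse into a single u8 -> u8 table, with ceil-then-clamp keeping every entry in range. A hedged sketch, where the `linearize` closure stands in for `TransferFunction::linearize`:

```rust
// One 256-entry map replaces linearize + float round-trip per pixel; the
// `.ceil().min(255.0)` mirrors the table construction in the patch.
fn build_u8_lut(linearize: impl Fn(f32) -> f32) -> [u8; 256] {
    let mut lut = [0u8; 256];
    for (i, slot) in lut.iter_mut().enumerate() {
        *slot = (linearize(i as f32 * (1.0 / 255.0)) * 255.0).ceil().min(255.0) as u8;
    }
    lut
}

fn main() {
    // Toy power-law transfer purely for demonstration.
    let lut = build_u8_lut(|v| v.powf(2.2));
    assert_eq!(lut[255], 255); // 1.0 maps exactly to full scale
    println!("mid-grey linearizes to {}", lut[128]);
}
```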
-#[cfg(feature = "rayon")] -use rayon::prelude::{ParallelSlice, ParallelSliceMut}; +use rayon::iter::ParallelIterator; #[cfg(feature = "rayon")] +use rayon::prelude::ParallelSliceMut; use std::slice; #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] @@ -53,9 +54,24 @@ fn channels_to_oklab( let channels = image_configuration.get_channels_count(); - let mut _wide_row_handle: Option< - unsafe fn(usize, *const u8, usize, u32, *mut f32, usize, TransferFunction) -> usize, - > = None; + let callee = match image_configuration { + ImageConfiguration::Rgb => rgb_to_linear, + ImageConfiguration::Rgba => rgba_to_linear, + ImageConfiguration::Bgra => bgra_to_linear, + ImageConfiguration::Bgr => bgr_to_linear, + }; + + callee( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); + + let mut _wide_row_handle: Option usize> = None; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { @@ -72,32 +88,30 @@ fn channels_to_oklab( _wide_row_handle = Some(avx_image_to_oklab::); } + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as *mut u8, + dst_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; - dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) - .zip(src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { + .for_each(|dst| unsafe { let mut _cx = 0usize; - let src_ptr = src.as_ptr(); let dst_ptr = dst.as_mut_ptr() as *mut f32; if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher(_cx, src.as_ptr(), 0, width, dst_ptr, 0, transfer_function) + _cx = dispatcher(_cx, width, dst_ptr, 0) } for x in _cx..width as usize { let px = x * channels; - let src = src_ptr.add(px); + let src = dst_ptr.add(px); let r = src .add(image_configuration.get_r_channel_offset()) .read_unaligned(); @@ -108,18 +122,18 @@ fn channels_to_oklab( .add(image_configuration.get_b_channel_offset()) .read_unaligned(); - let rgb = Rgb::::new(r, g, b); + let rgb = Rgb::::new(r, g, b); let dst_store = dst_ptr.add(px); match target { OklabTarget::Oklab => { - let oklab = Oklab::from_rgb(rgb, transfer_function); + let oklab = Oklab::from_linear_rgb(rgb); dst_store.write_unaligned(oklab.l); dst_store.add(1).write_unaligned(oklab.a); dst_store.add(2).write_unaligned(oklab.b); } OklabTarget::Oklch => { - let oklch = Oklch::from_rgb(rgb, transfer_function); + let oklch = Oklch::from_linear_rgb(rgb); dst_store.write_unaligned(oklch.l); dst_store.add(1).write_unaligned(oklch.c); dst_store.add(2).write_unaligned(oklch.h); @@ -130,8 +144,7 @@ fn channels_to_oklab( let a = src .add(image_configuration.get_a_channel_offset()) .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - dst_store.add(3).write_unaligned(a_lin); + dst_store.add(3).write_unaligned(a); } } }); @@ -139,82 +152,56 @@ fn channels_to_oklab( #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; - - for _ in 0..height as usize { - let mut _cx = 0usize; - - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; - - if let Some(dispatcher) = _wide_row_handle { - unsafe { - _cx = dispatcher( - _cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - transfer_function, - ) + for dst in dst_slice_safe_align.chunks_exact_mut(dst_stride as usize) { + 
unsafe { + let mut _cx = 0usize; + + let dst_ptr = dst.as_mut_ptr() as *mut f32; + + if let Some(dispatcher) = _wide_row_handle { + _cx = dispatcher(_cx, width, dst_ptr, 0) } - } - for x in _cx..width as usize { - let px = x * channels; - - let src = unsafe { src_ptr.add(px) }; - let r = unsafe { - src.add(image_configuration.get_r_channel_offset()) - .read_unaligned() - }; - let g = unsafe { - src.add(image_configuration.get_g_channel_offset()) - .read_unaligned() - }; - let b = unsafe { - src.add(image_configuration.get_b_channel_offset()) - .read_unaligned() - }; - - let rgb = Rgb::::new(r, g, b); - let dst_store = unsafe { dst_ptr.add(px) }; - - match target { - OklabTarget::Oklab => { - let oklab = Oklab::from_rgb(rgb, transfer_function); - unsafe { + for x in _cx..width as usize { + let px = x * channels; + + let src = dst_ptr.add(px); + let r = src + .add(image_configuration.get_r_channel_offset()) + .read_unaligned(); + let g = src + .add(image_configuration.get_g_channel_offset()) + .read_unaligned(); + let b = src + .add(image_configuration.get_b_channel_offset()) + .read_unaligned(); + + let rgb = Rgb::::new(r, g, b); + let dst_store = dst_ptr.add(px); + + match target { + OklabTarget::Oklab => { + let oklab = Oklab::from_linear_rgb(rgb); dst_store.write_unaligned(oklab.l); dst_store.add(1).write_unaligned(oklab.a); dst_store.add(2).write_unaligned(oklab.b); } - } - OklabTarget::Oklch => { - let oklch = Oklch::from_rgb(rgb, transfer_function); - unsafe { + OklabTarget::Oklch => { + let oklch = Oklch::from_linear_rgb(rgb); dst_store.write_unaligned(oklch.l); dst_store.add(1).write_unaligned(oklch.c); dst_store.add(2).write_unaligned(oklch.h); } } - } - if image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 255f32); - unsafe { - dst_store.add(3).write_unaligned(a_lin); + if image_configuration.has_alpha() { + let a = src + .add(image_configuration.get_a_channel_offset()) + .read_unaligned(); + dst_store.add(3).write_unaligned(a); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index 39d0368..5617d10 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -13,12 +13,11 @@ use crate::neon::neon_channels_to_xyz_or_lab; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::sse_channels_to_xyz_or_lab; use crate::xyz_target::XyzTarget; -use crate::{Rgb, Xyz, SRGB_TO_XYZ_D65}; +use crate::{LCh, Lab, Luv, Rgb, Xyz, SRGB_TO_XYZ_D65}; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -45,7 +44,7 @@ fn channels_to_xyz usize, > = None; @@ -75,15 +73,20 @@ fn channels_to_xyz); } + let mut lut_table = vec![0f32; 256]; + for i in 0..256 { + lut_table[i] = transfer_function.linearize(i as f32 * (1. 
/ 255.0)); + } + + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as *mut u8, + dst_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; - if USE_ALPHA { let a_slice_safe_align = unsafe { slice::from_raw_parts_mut( @@ -99,10 +102,22 @@ fn channels_to_xyz::new(r, g, b); + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); let ptr = dst_ptr.add(x * 3); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + match target { XyzTarget::Lab => { - let lab = rgb.to_lab(); + let lab = Lab::from_xyz(xyz); ptr.write_unaligned(lab.l); ptr.add(1).write_unaligned(lab.a); ptr.add(2).write_unaligned(lab.b); } XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); ptr.write_unaligned(xyz.x); ptr.add(1).write_unaligned(xyz.y); ptr.add(2).write_unaligned(xyz.z); } XyzTarget::Luv => { - let luv = rgb.to_luv(); + let luv = Luv::from_xyz(xyz); ptr.write_unaligned(luv.l); ptr.add(1).write_unaligned(luv.u); ptr.add(2).write_unaligned(luv.v); } XyzTarget::Lch => { - let lch = rgb.to_lch(); + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); ptr.write_unaligned(lch.l); ptr.add(1).write_unaligned(lch.c); ptr.add(2).write_unaligned(lch.h); @@ -160,12 +170,9 @@ fn channels_to_xyz::new(r, g, b); + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); let ptr = dst_ptr.add(x * 3); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + match target { XyzTarget::Lab => { - let lab = rgb.to_lab(); + let lab = Lab::from_xyz(xyz); ptr.write_unaligned(lab.l); ptr.add(1).write_unaligned(lab.a); ptr.add(2).write_unaligned(lab.b); } XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); ptr.write_unaligned(xyz.x); ptr.add(1).write_unaligned(xyz.y); ptr.add(2).write_unaligned(xyz.z); } XyzTarget::Luv => { - let luv = rgb.to_luv(); + let luv = Luv::from_xyz(xyz); ptr.write_unaligned(luv.l); ptr.add(1).write_unaligned(luv.u); ptr.add(2).write_unaligned(luv.v); } XyzTarget::Lch => { - let lch = rgb.to_lch(); + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); ptr.write_unaligned(lch.l); ptr.add(1).write_unaligned(lch.c); ptr.add(2).write_unaligned(lch.h); @@ -242,103 +255,173 @@ fn channels_to_xyz::new(r, g, b); - let ptr = unsafe { dst_ptr.add(x * 3) }; - match target { - XyzTarget::Lab => { - let lab = rgb.to_lab(); - unsafe { - ptr.write_unaligned(lab.l); - ptr.add(1).write_unaligned(lab.a); - ptr.add(2).write_unaligned(lab.b); - } + let mut transient_row = vec![0f32; width as usize * channels]; + + for (dst_chunk, src_chunks) in transient_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + dst_chunk[0] = *lut_table.get_unchecked(src_chunks[0] as usize); + dst_chunk[1] = *lut_table.get_unchecked(src_chunks[1] as usize); + dst_chunk[2] = *lut_table.get_unchecked(src_chunks[2] as usize); + dst_chunk[3] = src_chunks[3] as f32 * (1. 
/ 255.0); } - XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); - unsafe { - ptr.write_unaligned(xyz.x); - ptr.add(1).write_unaligned(xyz.y); - ptr.add(2).write_unaligned(xyz.z); - } + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + transient_row.as_ptr(), + 0, + width, + dst.as_mut_ptr() as *mut f32, + 0, + a_channel.as_mut_ptr() as *mut f32, + 0, + matrix, + ); } - XyzTarget::Luv => { - let luv = rgb.to_luv(); - unsafe { - ptr.write_unaligned(luv.l); - ptr.add(1).write_unaligned(luv.u); - ptr.add(2).write_unaligned(luv.v); + + let dst_ptr = dst.as_mut_ptr().add(0) as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); + let ptr = dst_ptr.add(x * 3); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + + match target { + XyzTarget::Lab => { + let lab = Lab::from_xyz(xyz); + ptr.write_unaligned(lab.l); + ptr.add(1).write_unaligned(lab.a); + ptr.add(2).write_unaligned(lab.b); + } + XyzTarget::Xyz => { + ptr.write_unaligned(xyz.x); + ptr.add(1).write_unaligned(xyz.y); + ptr.add(2).write_unaligned(xyz.z); + } + XyzTarget::Luv => { + let luv = Luv::from_xyz(xyz); + ptr.write_unaligned(luv.l); + ptr.add(1).write_unaligned(luv.u); + ptr.add(2).write_unaligned(luv.v); + } + XyzTarget::Lch => { + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); + ptr.write_unaligned(lch.l); + ptr.add(1).write_unaligned(lch.c); + ptr.add(2).write_unaligned(lch.h); + } } - } - XyzTarget::Lch => { - let lch = rgb.to_lch(); - unsafe { - ptr.write_unaligned(lch.l); - ptr.add(1).write_unaligned(lch.c); - ptr.add(2).write_unaligned(lch.h); + + if USE_ALPHA && image_configuration.has_alpha() { + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + let a_ptr = a_channel.as_mut_ptr() as *mut f32; + a_ptr.add(x).write_unaligned(a); } } } + } + } else { + for (dst, src) in dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; + + let mut transient_row = vec![0f32; width as usize * channels]; + + for (dst_chunk, src_chunks) in transient_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + dst_chunk[0] = *lut_table.get_unchecked(src_chunks[0] as usize); + dst_chunk[1] = *lut_table.get_unchecked(src_chunks[1] as usize); + dst_chunk[2] = *lut_table.get_unchecked(src_chunks[2] as usize); + } + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + transient_row.as_ptr(), + 0, + width, + dst.as_mut_ptr() as *mut f32, + 0, + std::ptr::null_mut(), + 0, + matrix, + ); + } + + let dst_ptr = dst.as_mut_ptr().add(0) as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); + let ptr = dst_ptr.add(x * 3); - if USE_ALPHA && image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 
255f32); - let a_ptr = - unsafe { (a_channel.as_mut_ptr() as *mut u8).add(a_offset) as *mut f32 }; - unsafe { - a_ptr.add(x).write_unaligned(a_lin); + let xyz = Xyz::from_linear_rgb(rgb, matrix); + + match target { + XyzTarget::Lab => { + let lab = Lab::from_xyz(xyz); + ptr.write_unaligned(lab.l); + ptr.add(1).write_unaligned(lab.a); + ptr.add(2).write_unaligned(lab.b); + } + XyzTarget::Xyz => { + ptr.write_unaligned(xyz.x); + ptr.add(1).write_unaligned(xyz.y); + ptr.add(2).write_unaligned(xyz.z); + } + XyzTarget::Luv => { + let luv = Luv::from_xyz(xyz); + ptr.write_unaligned(luv.l); + ptr.add(1).write_unaligned(luv.u); + ptr.add(2).write_unaligned(luv.v); + } + XyzTarget::Lch => { + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); + ptr.write_unaligned(lch.l); + ptr.add(1).write_unaligned(lch.c); + ptr.add(2).write_unaligned(lch.h); + } + } } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; - a_offset += a_stride as usize; } } } diff --git a/src/image_xyza_laba.rs b/src/image_xyza_laba.rs index 1e97acb..4d6f051 100644 --- a/src/image_xyza_laba.rs +++ b/src/image_xyza_laba.rs @@ -11,12 +11,11 @@ use crate::neon::neon_channels_to_xyza_or_laba; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::sse_channels_to_xyza_laba; use crate::xyz_target::XyzTarget; -use crate::{Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65}; +use crate::{LCh, Lab, Luv, Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65}; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -37,16 +36,7 @@ fn channels_to_xyz_with_alpha usize, + unsafe fn(usize, *const f32, usize, u32, *mut f32, usize, &[[f32; 3]; 3]) -> usize, > = None; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -63,180 +53,177 @@ fn channels_to_xyz_with_alpha::new(r, g, b); + let src = transient_row.get_unchecked(px..); + + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); let px = x * channels; let dst_store = dst_ptr.add(px); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + match target { XyzTarget::Lab => { - let lab = rgb.to_lab(); + let lab = Lab::from_xyz(xyz); dst_store.write_unaligned(lab.l); dst_store.add(1).write_unaligned(lab.a); dst_store.add(2).write_unaligned(lab.b); } XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); dst_store.write_unaligned(xyz.x); dst_store.add(1).write_unaligned(xyz.y); dst_store.add(2).write_unaligned(xyz.z); } XyzTarget::Luv => { - let luv = rgb.to_luv(); + let luv = Luv::from_xyz(xyz); dst_store.write_unaligned(luv.l); dst_store.add(1).write_unaligned(luv.u); dst_store.add(2).write_unaligned(luv.v); } XyzTarget::Lch => { - let lch = rgb.to_lch(); + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); dst_store.write_unaligned(lch.l); dst_store.add(1).write_unaligned(lch.c); dst_store.add(2).write_unaligned(lch.h); } } - - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - dst_store.add(3).write_unaligned(a_lin); + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } }); } #[cfg(not(feature = 
"rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + for (dst, src) in dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; - for _ in 0..height as usize { - let mut _cx = 0usize; + let mut transient_row = vec![0f32; width as usize * channels]; - if let Some(dispatcher) = _wide_row_handler { - unsafe { + for (dst_chunk, src_chunks) in transient_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + dst_chunk[0] = *lut_table.get_unchecked(src_chunks[0] as usize); + dst_chunk[1] = *lut_table.get_unchecked(src_chunks[1] as usize); + dst_chunk[2] = *lut_table.get_unchecked(src_chunks[2] as usize); + dst_chunk[3] = src_chunks[3] as f32 * (1. / 255.0); + } + + if let Some(dispatcher) = _wide_row_handler { _cx = dispatcher( _cx, - src.as_ptr(), - src_offset, + transient_row.as_ptr(), + 0, width, - dst.as_mut_ptr(), - dst_offset, + dst.as_mut_ptr() as *mut f32, + 0, matrix, - transfer_function, ); } - } - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; - - for x in _cx..width as usize { - let px = x * channels; - let src = unsafe { src_ptr.add(px) }; - let r = unsafe { - src.add(image_configuration.get_r_channel_offset()) - .read_unaligned() - }; - let g = unsafe { - src.add(image_configuration.get_g_channel_offset()) - .read_unaligned() - }; - let b = unsafe { - src.add(image_configuration.get_b_channel_offset()) - .read_unaligned() - }; - - let rgb = Rgb::::new(r, g, b); - let px = x * channels; - let dst_store = unsafe { dst_ptr.add(px) }; - match target { - XyzTarget::Lab => { - let lab = rgb.to_lab(); - unsafe { + let dst_ptr = dst.as_mut_ptr() as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + let src = transient_row.get_unchecked(px..); + + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); + let px = x * channels; + let dst_store = dst_ptr.add(px); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + + match target { + XyzTarget::Lab => { + let lab = Lab::from_xyz(xyz); dst_store.write_unaligned(lab.l); dst_store.add(1).write_unaligned(lab.a); dst_store.add(2).write_unaligned(lab.b); } - } - XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); - unsafe { + XyzTarget::Xyz => { dst_store.write_unaligned(xyz.x); dst_store.add(1).write_unaligned(xyz.y); dst_store.add(2).write_unaligned(xyz.z); } - } - XyzTarget::Luv => { - let luv = rgb.to_luv(); - unsafe { + XyzTarget::Luv => { + let luv = Luv::from_xyz(xyz); dst_store.write_unaligned(luv.l); dst_store.add(1).write_unaligned(luv.u); dst_store.add(2).write_unaligned(luv.v); } - } - XyzTarget::Lch => { - let lch = rgb.to_lch(); - unsafe { + XyzTarget::Lch => { + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); dst_store.write_unaligned(lch.l); dst_store.add(1).write_unaligned(lch.c); dst_store.add(2).write_unaligned(lch.h); } } - } - - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 255f32); - unsafe { - dst_store.add(3).write_unaligned(a_lin); + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } } - - src_offset += 
src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/lab.rs b/src/lab.rs index 5bd9b32..587a32a 100644 --- a/src/lab.rs +++ b/src/lab.rs @@ -85,7 +85,7 @@ impl Lab { } impl Lab { - /// Converts CIE Lab into CIE XYZ + /// Converts CIE [Lab] into CIE [Xyz] #[inline] pub fn to_xyz(&self) -> Xyz { let y = (self.l + 16.0) / 116.0; @@ -125,6 +125,13 @@ impl Lab { Xyz::new(xyz.x, xyz.y, xyz.z).to_srgb() } + /// Converts CIE [Lab] into linear [Rgb] + #[inline] + pub fn to_linear_rgb(&self, matrix: &[[f32; 3]; 3]) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_linear_rgb(matrix) + } + /// Converts CIE Lab into Rgb #[inline] pub fn to_rgb(&self) -> Rgb { diff --git a/src/linear_to_image.rs b/src/linear_to_image.rs index e74a35d..0a76bba 100644 --- a/src/linear_to_image.rs +++ b/src/linear_to_image.rs @@ -4,20 +4,13 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. */ -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::avx::avx_linear_to_gamma; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::neon_linear_to_gamma; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_linear_to_gamma; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -35,52 +28,29 @@ fn linear_to_gamma_channels usize, - > = None; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handle = Some(neon_linear_to_gamma::); - } - let channels = image_configuration.get_channels_count(); - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handle = Some(sse_linear_to_gamma::); + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) 
as u8; } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("avx2") { - _wide_row_handle = Some(avx_linear_to_gamma::); - } + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; dst.par_chunks_exact_mut(dst_stride as usize) .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - src.as_ptr() as *const f32, - 0, - dst.as_mut_ptr(), - 0, - width, - transfer_function, - ); - } - let src_ptr = src.as_ptr() as *const f32; let dst_ptr = dst.as_mut_ptr(); @@ -97,19 +67,21 @@ fn linear_to_gamma_channels::new( + let rgb = (Rgb::::new( r.min(1f32).max(0f32), g.min(1f32).max(0f32), b.min(1f32).max(0f32), - ); + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); let dst = dst_ptr.add(px); - let transferred = rgb.gamma(transfer_function); - let rgb8 = transferred.to_u8(); - dst.write_unaligned(rgb8.r); - dst.add(1).write_unaligned(rgb8.g); - dst.add(2).write_unaligned(rgb8.b); + dst.write_unaligned(*lut_table.get_unchecked(rgb.r.min(2048) as usize)); + dst.add(1) + .write_unaligned(*lut_table.get_unchecked(rgb.g.min(2048) as usize)); + dst.add(2) + .write_unaligned(*lut_table.get_unchecked(rgb.b.min(2048) as usize)); if USE_ALPHA && image_configuration.has_alpha() { let a = src_slice @@ -124,79 +96,54 @@ fn linear_to_gamma_channels::new( - r.min(1f32).max(0f32), - g.min(1f32).max(0f32), - b.min(1f32).max(0f32), - ); + let rgb = (Rgb::::new( + r.min(1f32).max(0f32), + g.min(1f32).max(0f32), + b.min(1f32).max(0f32), + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); - let dst = unsafe { dst_ptr.add(px) }; - let transferred = rgb.gamma(transfer_function); - let rgb8 = transferred.to_u8(); + let dst = dst_ptr.add(px); - unsafe { - dst.write_unaligned(rgb8.r); - dst.add(1).write_unaligned(rgb8.g); - dst.add(2).write_unaligned(rgb8.b); - } + dst.write_unaligned(*lut_table.get_unchecked(rgb.r.min(2048) as usize)); + dst.add(1) + .write_unaligned(*lut_table.get_unchecked(rgb.g.min(2048) as usize)); + dst.add(2) + .write_unaligned(*lut_table.get_unchecked(rgb.b.min(2048) as usize)); - if USE_ALPHA && image_configuration.has_alpha() { - let a = unsafe { - src_slice + if USE_ALPHA && image_configuration.has_alpha() { + let a = src_slice .add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = (a * 255f32).round() as u8; - unsafe { + .read_unaligned(); + let a_lin = (a * 255f32).round() as u8; dst.add(3).write_unaligned(a_lin); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/linear_to_image_u8.rs b/src/linear_to_image_u8.rs index b3cdd31..2cf1a36 100644 --- a/src/linear_to_image_u8.rs +++ b/src/linear_to_image_u8.rs @@ -7,10 +7,6 @@ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::*; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_image_to_linear_unsigned::sse_channels_to_linear_u8; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -36,20 +32,11 @@ fn linear_to_gamma_channels usize, - > = None; 
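Going the other way in linear_to_image.rs above, the input is continuous f32, so the table needs a quantization step: clamped linear values are scaled by 2048 and rounded into a 2049-entry gamma table, i.e. an 11-bit index over [0, 1]. A sketch of that shape, with `gamma` standing in for `TransferFunction::gamma`:

```rust
// 2049 entries cover indices 0..=2048, so linear 1.0 maps exactly to the
// last slot; construction mirrors the ceil+clamp in the hunk above.
fn build_gamma_lut(gamma: impl Fn(f32) -> f32) -> Vec<u8> {
    (0..=2048)
        .map(|i| (gamma(i as f32 * (1.0 / 2048.0)) * 255.0).ceil().min(255.0) as u8)
        .collect()
}

// Clamp first, as the patch does, so the rounded index stays in bounds.
fn encode(lut: &[u8], linear: f32) -> u8 {
    let idx = (linear.clamp(0.0, 1.0) * 2048.0).round() as usize;
    lut[idx]
}

fn main() {
    let lut = build_gamma_lut(|v| v.powf(1.0 / 2.2)); // toy transfer
    assert_eq!(encode(&lut, 1.0), 255);
    println!("0.5 encodes to {}", encode(&lut, 0.5));
}
```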
- - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handler = - Some(sse_channels_to_linear_u8::); - } - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handler = - Some(neon_channels_to_linear_u8::); + let mut lut_table = vec![0u8; 256]; + for i in 0..256 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 255.0)) * 255.) + .ceil() + .min(255.) as u8; } #[cfg(feature = "rayon")] @@ -59,18 +46,6 @@ fn linear_to_gamma_channels::new(r, g, b); - let mut rgb = rgb.to_rgb_f32(); - rgb = rgb.gamma(transfer_function); - let new_rgb = rgb.to_u8(); - - *dst.get_unchecked_mut(px) = new_rgb.r; - *dst.get_unchecked_mut(px + 1) = new_rgb.g; - *dst.get_unchecked_mut(px + 2) = new_rgb.b; + *dst.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); + *dst.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); + *dst.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); if USE_ALPHA && image_configuration.has_alpha() { let a = src.get_unchecked(px + image_configuration.get_a_channel_offset()); @@ -103,20 +74,6 @@ fn linear_to_gamma_channels) -> Self { let xyz = Xyz::from_srgb(rgb); + Self::from_xyz(xyz) + } + + /// Converts CIE XYZ to CIE Luv + #[inline] + #[allow(clippy::manual_clamp)] + pub fn from_xyz(xyz: Xyz) -> Self { let [x, y, z] = [xyz.x, xyz.y, xyz.z]; let den = x + 15.0 * y + 3.0 * z; @@ -106,9 +113,9 @@ impl Luv { } #[inline] - pub fn to_rgb(&self) -> Rgb { + pub fn to_xyz(&self) -> Xyz { if self.l <= 0f32 { - return Xyz::new(0f32, 0f32, 0f32).to_srgb(); + return Xyz::new(0f32, 0f32, 0f32); } let l13 = 1f32 / (13f32 * self.l); let u = self.u * l13 + LUV_WHITE_U_PRIME; @@ -128,7 +135,20 @@ impl Luv { z = 0f32; } - Xyz::new(x, y, z).to_srgb() + Xyz::new(x, y, z) + } + + /// Converts CIE [Luv] into linear [Rgb] + #[inline] + pub fn to_linear_rgb(&self, matrix: &[[f32; 3]; 3]) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_linear_rgb(matrix) + } + + #[inline] + pub fn to_rgb(&self) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_srgb() } pub fn new(l: f32, u: f32, v: f32) -> Luv { @@ -169,10 +189,23 @@ impl LCh { } } + #[inline] pub fn to_rgb(&self) -> Rgb { self.to_luv().to_rgb() } + #[inline] + pub fn to_xyz(&self) -> Xyz { + self.to_luv().to_xyz() + } + + #[inline] + pub fn to_linear_rgb(&self, matrix: &[[f32; 3]; 3]) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_linear_rgb(matrix) + } + + #[inline] pub fn to_luv(&self) -> Luv { Luv { l: self.l, diff --git a/src/neon/cie.rs b/src/neon/cie.rs index 49bfcc7..9456929 100644 --- a/src/neon/cie.rs +++ b/src/neon/cie.rs @@ -10,16 +10,14 @@ use crate::luv::{ LUV_WHITE_V_PRIME, }; use crate::neon::math::{prefer_vfmaq_f32, vcolorq_matrix_f32, vcubeq_f32}; -use crate::neon::neon_perform_linear_transfer; -use crate::TransferFunction; use erydanos::{vatan2q_f32, vcbrtq_fast_f32, vcosq_f32, vhypotq_fast_f32, vsinq_f32}; use std::arch::aarch64::*; #[inline(always)] pub(crate) unsafe fn neon_triple_to_xyz( - r: uint32x4_t, - g: uint32x4_t, - b: uint32x4_t, + r: float32x4_t, + g: float32x4_t, + b: float32x4_t, c1: float32x4_t, c2: float32x4_t, c3: float32x4_t, @@ -29,17 +27,9 @@ pub(crate) unsafe fn neon_triple_to_xyz( c7: float32x4_t, c8: float32x4_t, c9: float32x4_t, - transfer_function: TransferFunction, ) -> (float32x4_t, float32x4_t, float32x4_t) { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 
1f32 / 255f32); - let r_linear = neon_perform_linear_transfer(transfer_function, r_f); - let g_linear = neon_perform_linear_transfer(transfer_function, g_f); - let b_linear = neon_perform_linear_transfer(transfer_function, b_f); - let (x, y, z) = vcolorq_matrix_f32( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, + r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); (x, y, z) } diff --git a/src/neon/image_to_oklab.rs b/src/neon/image_to_oklab.rs index 151e936..9e72371 100644 --- a/src/neon/image_to_oklab.rs +++ b/src/neon/image_to_oklab.rs @@ -7,11 +7,7 @@ use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::neon::math::vcolorq_matrix_f32; -use crate::neon::neon_perform_linear_transfer; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, - TransferFunction, -}; +use crate::load_f32_and_deinterleave; use erydanos::{vatan2q_f32, vcbrtq_fast_f32, vhypotq_fast_f32}; use std::arch::aarch64::*; @@ -20,15 +16,8 @@ macro_rules! triple_to_oklab { $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let r_f = vmulq_n_f32(vcvtq_f32_u32($r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32($g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32($b), 1f32 / 255f32); - let dl_l = neon_perform_linear_transfer($transfer, r_f); - let dl_m = neon_perform_linear_transfer($transfer, g_f); - let dl_s = neon_perform_linear_transfer($transfer, b_f); - let (l_l, l_m, l_s) = vcolorq_matrix_f32( - dl_l, dl_m, dl_s, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, + $r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, ); let l_ = vcbrtq_fast_f32(l_l); @@ -52,12 +41,9 @@ macro_rules! 
triple_to_oklab { #[inline(always)] pub unsafe fn neon_image_to_oklab( start_cx: usize, - src: *const u8, - src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -90,297 +76,15 @@ pub unsafe fn neon_image_to_oklab( - src: *const f32, - transfer_function: TransferFunction, -) -> (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { - let v_scale_alpha = vdupq_n_f32(255f32); - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (mut r_f32, mut g_f32, mut b_f32, mut a_f32) = - load_f32_and_deinterleave!(src, image_configuration); - - r_f32 = neon_perform_gamma_transfer(transfer_function, r_f32); - g_f32 = neon_perform_gamma_transfer(transfer_function, g_f32); - b_f32 = neon_perform_gamma_transfer(transfer_function, b_f32); - r_f32 = vmulq_f32(r_f32, v_scale_alpha); - g_f32 = vmulq_f32(g_f32, v_scale_alpha); - b_f32 = vmulq_f32(b_f32, v_scale_alpha); - if USE_ALPHA { - a_f32 = vmulq_f32(a_f32, v_scale_alpha); - } - ( - vcvtaq_u32_f32(r_f32), - vcvtaq_u32_f32(g_f32), - vcvtaq_u32_f32(b_f32), - vcvtaq_u32_f32(a_f32), - ) -} - -pub unsafe fn neon_linear_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_gamma_vld::(src_ptr_1, transfer_function); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - neon_gamma_vld::(src_ptr_2, transfer_function); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - neon_gamma_vld::(src_ptr_3, transfer_function); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - 
vst4q_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x3_t(b_row, g_row, r_row) - } - }; - vst3q_u8(dst_ptr, store_rows); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_gamma_vld::(src_ptr_1, transfer_function); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - vst3_u8(dst_ptr, store_rows); - } - - cx += 8; - } - - while cx + 4 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_gamma_vld::(src_ptr_0, transfer_function); - - let zero = vdup_n_u16(0); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), zero); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), zero); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), zero); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), zero); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - let mut transient: [u8; 24] = [0; 24]; - vst3_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); - } - - cx += 4; - } - - cx -} diff --git a/src/neon/mod.rs b/src/neon/mod.rs 
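The neon/mod.rs hunk just below unregisters the deleted modules; with gamma and linearization handled by the scalar LUT passes, the surviving NEON kernels take and return linear f32 lanes and drop their `TransferFunction` parameter. An illustrative before/after of the kernel shape (signatures simplified, not the crate's exact items):

```rust
// Signature sketch only: the reworked kernels reduce to a 3x3 matrix apply
// on lanes that are already linear. Compiles on aarch64 targets.
#[cfg(target_arch = "aarch64")]
mod shape {
    use std::arch::aarch64::*;

    // Before (sketch): unsafe fn triple_to_xyz(r: uint32x4_t, /* ... */,
    //     transfer: TransferFunction) -> (float32x4_t, float32x4_t, float32x4_t)

    // After (sketch): m[0..3], m[3..6], m[6..9] are splatted matrix rows.
    pub unsafe fn triple_to_xyz_linear(
        r: float32x4_t,
        g: float32x4_t,
        b: float32x4_t,
        m: [float32x4_t; 9],
    ) -> (float32x4_t, float32x4_t, float32x4_t) {
        // vfmaq_f32(a, b, c) computes a + b * c per lane.
        let x = vfmaq_f32(vfmaq_f32(vmulq_f32(r, m[0]), g, m[1]), b, m[2]);
        let y = vfmaq_f32(vfmaq_f32(vmulq_f32(r, m[3]), g, m[4]), b, m[5]);
        let z = vfmaq_f32(vfmaq_f32(vmulq_f32(r, m[6]), g, m[7]), b, m[8]);
        (x, y, z)
    }
}

fn main() {}
```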
index c7c20f9..22c8d9d 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -14,15 +14,12 @@ mod image_to_hsv; mod image_to_jzazbz; mod image_to_oklab; mod jzazbz_to_image; -mod linear_to_image; pub mod linear_to_planar; mod math; mod oklab_to_image; pub mod planar_to_linear; mod routines; mod sigmoidal; -mod to_linear; -mod to_linear_u8; mod to_sigmoidal; mod to_xyz_lab; mod to_xyza_laba; @@ -37,10 +34,7 @@ pub use image_to_hsv::*; pub use image_to_jzazbz::neon_image_to_jzazbz; pub use image_to_oklab::neon_image_to_oklab; pub use jzazbz_to_image::neon_jzazbz_to_image; -pub use linear_to_image::*; pub use oklab_to_image::neon_oklab_to_image; -pub use to_linear::*; -pub use to_linear_u8::*; pub use to_sigmoidal::neon_image_to_sigmoidal; pub use to_xyz_lab::*; pub use to_xyza_laba::*; diff --git a/src/neon/oklab_to_image.rs b/src/neon/oklab_to_image.rs index e883cda..ca94daf 100644 --- a/src/neon/oklab_to_image.rs +++ b/src/neon/oklab_to_image.rs @@ -11,13 +11,11 @@ use erydanos::{vcosq_f32, vsinq_f32}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::neon::math::vcolorq_matrix_f32; -use crate::neon::neon_perform_gamma_transfer; -use crate::{load_f32_and_deinterleave_direct, TransferFunction}; +use crate::load_f32_and_deinterleave_direct; #[inline(always)] unsafe fn neon_oklab_gamma_vld( src: *const f32, - transfer_function: TransferFunction, m0: float32x4_t, m1: float32x4_t, m2: float32x4_t, @@ -36,11 +34,10 @@ unsafe fn neon_oklab_gamma_vld (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { +) -> (float32x4_t, float32x4_t, float32x4_t, float32x4_t) { let target: OklabTarget = TARGET.into(); - let v_scale_alpha = vdupq_n_f32(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (l, mut a, mut b, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); + let (l, mut a, mut b, a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); if target == OklabTarget::Oklch { let a0 = vmulq_f32(a, vcosq_f32(b)); @@ -57,34 +54,17 @@ unsafe fn neon_oklab_gamma_vld( start_cx: usize, src: *const f32, - src_offset: u32, - dst: *mut u8, + src_offset: usize, + dst: *mut f32, dst_offset: u32, width: u32, - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); let channels = image_configuration.get_channels_count(); @@ -114,259 +94,13 @@ pub unsafe fn neon_oklab_to_image( - src_ptr_0, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_oklab_gamma_vld::( - src_ptr_1, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - neon_oklab_gamma_vld::( - src_ptr_2, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - neon_oklab_gamma_vld::( - src_ptr_3, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = 
vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4q_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x3_t(b_row, g_row, r_row) - } - }; - vst3q_u8(dst_ptr, store_rows); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_oklab_gamma_vld::( - src_ptr_0, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_oklab_gamma_vld::( - src_ptr_1, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - vst3_u8(dst_ptr, store_rows); - } - - cx += 8; - } - while cx + 4 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; + let v_src_ptr = + ((src as *mut u8).add(src_offset) as *mut f32).add(cx * channels); 
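[Note on the addressing convention visible in the reworked NEON loop above: row offsets in this crate are byte counts, so the f32 base pointer is first cast to a byte pointer, advanced by the byte offset, and only then reinterpreted back to *mut f32 before the per-pixel element offset is applied. A minimal standalone sketch of that idiom, with illustrative names (row_ptr is not a crate function):

    // Advance a float row pointer by a byte stride, then by `cx` pixels of
    // `channels` f32 components each. Mirrors the inline expression used in
    // the hunk above; the caller must keep the offsets in bounds.
    unsafe fn row_ptr(
        base: *const f32,
        offset_bytes: usize,
        cx: usize,
        channels: usize,
    ) -> *const f32 {
        ((base as *const u8).add(offset_bytes) as *const f32).add(cx * channels)
    }
]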
let (r_row0_, g_row0_, b_row0_, a_row0_) = neon_oklab_gamma_vld::( - src_ptr_0, - transfer_function, + v_src_ptr, m0, m1, m2, @@ -387,44 +121,29 @@ pub unsafe fn neon_oklab_to_image { - uint8x8x4_t(r_row, g_row, b_row, a_row) + float32x4x4_t(r_row0_, g_row0_, b_row0_, a_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) + float32x4x4_t(b_row0_, g_row0_, r_row0_, a_row0_) } }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + vst4q_f32(in_place_ptr, store_rows); } else { let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) + float32x4x3_t(r_row0_, g_row0_, b_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) + float32x4x3_t(b_row0_, g_row0_, r_row0_) } }; - let mut transient: [u8; 24] = [0; 24]; - vst3_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); + vst3q_f32(in_place_ptr, store_rows); } cx += 4; diff --git a/src/neon/to_linear.rs b/src/neon/to_linear.rs deleted file mode 100644 index ed0cc8d..0000000 --- a/src/neon/to_linear.rs +++ /dev/null @@ -1,312 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::gamma_curves::TransferFunction; -use crate::image::ImageConfiguration; -use crate::neon::*; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, -}; -use std::arch::aarch64::*; - -#[inline(always)] -pub(crate) unsafe fn neon_triple_to_linear( - r: uint32x4_t, - g: uint32x4_t, - b: uint32x4_t, - transfer_function: TransferFunction, -) -> (float32x4_t, float32x4_t, float32x4_t) { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 1f32 / 255f32); - let r_linear = neon_perform_linear_transfer(transfer_function, r_f); - let g_linear = neon_perform_linear_transfer(transfer_function, g_f); - let b_linear = neon_perform_linear_transfer(transfer_function, b_f); - (r_linear, g_linear, b_linear) -} - -pub unsafe fn neon_channels_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - neon_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - if USE_ALPHA { - let a_low_low = - 
vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_low, y_low_low, z_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_low, y_low_low, x_low_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels), store_rows); - } - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = - neon_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_high, y_low_high, x_low_high, a_low_high) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_high, y_low_high, z_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_high, y_low_high, x_low_high) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = vmovl_u16(vget_low_u16(b_high)); - - let (x_high_low, y_high_low, z_high_low) = - neon_triple_to_linear(r_high_low, g_high_low, b_high_low, transfer_function); - - let a_high = vmovl_high_u8(a_chan); - - if USE_ALPHA { - let a_high_low = vmulq_n_f32( - vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_high))), - 1f32 / 255f32, - ); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_high_low, y_high_low, z_high_low, a_high_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_high_low, y_high_low, x_high_low, a_high_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels * 2), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_high_low, y_high_low, z_high_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_high_low, y_high_low, x_high_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels * 2), store_rows); - } - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (x_high_high, y_high_high, z_high_high) = - neon_triple_to_linear(r_high_high, g_high_high, b_high_high, transfer_function); - - if USE_ALPHA { - let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); - let 
store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_high_high, y_high_high, z_high_high, a_high_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_high_high, y_high_high, x_high_high, a_high_high) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels * 3), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_high_high, y_high_high, z_high_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_high_high, y_high_high, x_high_high) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels * 3), store_rows); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - neon_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - if USE_ALPHA { - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_low, y_low_low, z_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_low, y_low_low, x_low_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels), store_rows); - } - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = - neon_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_high, y_low_high, x_low_high, a_low_high) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_high, y_low_high, z_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_high, y_low_high, x_low_high) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } - - cx += 8; - } - - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = 
vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - neon_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - if USE_ALPHA { - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_low, y_low_low, z_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_low, y_low_low, x_low_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels), store_rows); - } - - cx += 4; - } - - cx -} diff --git a/src/neon/to_linear_u8.rs b/src/neon/to_linear_u8.rs deleted file mode 100644 index bdd15a3..0000000 --- a/src/neon/to_linear_u8.rs +++ /dev/null @@ -1,256 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::image::ImageConfiguration; -use crate::neon::{neon_perform_gamma_transfer, neon_perform_linear_transfer}; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, - TransferFunction, -}; -use std::arch::aarch64::*; - -#[inline(always)] -pub(crate) unsafe fn neon_triple_to_linear_u8( - r: uint32x4_t, - g: uint32x4_t, - b: uint32x4_t, - transfer_function: TransferFunction, -) -> (uint32x4_t, uint32x4_t, uint32x4_t) { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 1f32 / 255f32); - let r_linear = vmulq_n_f32( - match INTO_LINEAR { - true => neon_perform_linear_transfer(transfer_function, r_f), - false => neon_perform_gamma_transfer(transfer_function, r_f), - }, - 255f32, - ); - let g_linear = vmulq_n_f32( - match INTO_LINEAR { - true => neon_perform_linear_transfer(transfer_function, g_f), - false => neon_perform_gamma_transfer(transfer_function, g_f), - }, - 255f32, - ); - let b_linear = vmulq_n_f32( - match INTO_LINEAR { - true => neon_perform_linear_transfer(transfer_function, b_f), - false => neon_perform_gamma_transfer(transfer_function, b_f), - }, - 255f32, - ); - - ( - vcvtaq_u32_f32(r_linear), - vcvtaq_u32_f32(g_linear), - vcvtaq_u32_f32(b_linear), - ) -} - -#[inline] -pub unsafe fn neon_channels_to_linear_u8< - const CHANNELS_CONFIGURATION: u8, - const USE_ALPHA: bool, - const INTO_LINEAR: bool, ->( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut u8, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = dst.add(dst_offset); - - while cx + 16 < width as usize { - 
let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = neon_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = neon_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = vmovl_u16(vget_low_u16(b_high)); - - let (x_high_low, y_high_low, z_high_low) = neon_triple_to_linear_u8::( - r_high_low, - g_high_low, - b_high_low, - transfer_function, - ); - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (x_high_high, y_high_high, z_high_high) = neon_triple_to_linear_u8::( - r_high_high, - g_high_high, - b_high_high, - transfer_function, - ); - - let r_u_norm = vcombine_u8( - vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), vmovn_u32(x_low_high))), - vqmovn_u16(vcombine_u16(vmovn_u32(x_high_low), vmovn_u32(x_high_high))), - ); - - let g_u_norm = vcombine_u8( - vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), vmovn_u32(y_low_high))), - vqmovn_u16(vcombine_u16(vmovn_u32(y_high_low), vmovn_u32(y_high_high))), - ); - - let b_u_norm = vcombine_u8( - vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), vmovn_u32(z_low_high))), - vqmovn_u16(vcombine_u16(vmovn_u32(z_high_low), vmovn_u32(z_high_high))), - ); - - if USE_ALPHA { - let v_4 = uint8x16x4_t(r_u_norm, g_u_norm, b_u_norm, a_chan); - vst4q_u8(dst_ptr.add(cx * channels), v_4); - } else { - let v_4 = uint8x16x3_t(r_u_norm, g_u_norm, b_u_norm); - vst3q_u8(dst_ptr.add(cx * channels), v_4); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = neon_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = neon_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), vmovn_u32(x_low_high))); - - let g_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), vmovn_u32(y_low_high))); - - let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), vmovn_u32(z_low_high))); - - let dst = 
dst_ptr.add(cx * channels); - - if USE_ALPHA { - let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan)); - vst4_u8(dst, v_4); - } else { - let v_4 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm); - vst3_u8(dst, v_4); - } - - cx += 8; - } - - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = neon_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let zeros = vdup_n_u16(0); - - let r_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), zeros)); - - let g_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), zeros)); - - let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), zeros)); - - let dst = dst_ptr.add(cx * channels); - - if USE_ALPHA { - let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan)); - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), v_4); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 4); - } else { - let v_3 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm); - let mut transient: [u8; 24] = [0; 24]; - vst3_u8(transient.as_mut_ptr(), v_3); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 3); - } - - cx += 4; - } - - cx -} diff --git a/src/neon/to_xyz_lab.rs b/src/neon/to_xyz_lab.rs index a1df476..4ad96bd 100644 --- a/src/neon/to_xyz_lab.rs +++ b/src/neon/to_xyz_lab.rs @@ -5,15 +5,12 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; +use crate::load_f32_and_deinterleave; use crate::neon::cie::{ neon_triple_to_lab, neon_triple_to_lch, neon_triple_to_luv, neon_triple_to_xyz, }; use crate::xyz_target::XyzTarget; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, -}; use std::arch::aarch64::*; #[inline(always)] @@ -23,7 +20,7 @@ pub unsafe fn neon_channels_to_xyz_or_lab< const TARGET: u8, >( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, @@ -31,7 +28,6 @@ pub unsafe fn neon_channels_to_xyz_or_lab< a_linearized: *mut f32, a_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { if USE_ALPHA && a_linearized.is_null() { panic!("Null alpha channel with requirements of linearized alpha if not supported"); @@ -53,364 +49,15 @@ pub unsafe fn neon_channels_to_xyz_or_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); - vst3q_f32(dst_ptr.add(cx * 3), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3), xyz_low_low); - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = 
vmovl_u16(vget_low_u16(b_high)); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = neon_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let xyz_low_low = float32x4x3_t(x_high_low, y_high_low, z_high_low); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3 * 2), xyz_low_low); - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = neon_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let xyz_low_low = float32x4x3_t(x_high_high, y_high_high, z_high_high); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3 * 3), xyz_low_low); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx), a_low_low); - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx + 4), a_low_high); - - let a_high = vmovl_high_u8(a_chan); - - let a_high_low = vmulq_n_f32( - vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_high))), - 1f32 / 255f32, - ); - - vst1q_f32(a_ptr.add(cx + 4 * 2), a_high_low); - - let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx + 4 * 3), a_high_high); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { 
- let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); - vst3q_f32(dst_ptr.add(cx * 3), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3), xyz_low_low); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx), a_low_low); - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx + 4), a_low_high); - } - - cx += 8; - } - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, + r_chan, + g_chan, + b_chan, cq1, cq2, cq3, @@ -420,7 +67,6 @@ pub unsafe fn neon_channels_to_xyz_or_lab< cq7, cq8, cq9, - transfer_function, ); match target { @@ -451,16 +97,12 @@ pub unsafe fn neon_channels_to_xyz_or_lab< if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx), a_low_low); + vst1q_f32(a_ptr.add(cx), a_chan); } cx += 4; } + cx } diff --git a/src/neon/to_xyza_laba.rs b/src/neon/to_xyza_laba.rs index 8e93f19..0542683 100644 --- a/src/neon/to_xyza_laba.rs +++ b/src/neon/to_xyza_laba.rs @@ -5,27 +5,23 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::neon::cie::{ neon_triple_to_lab, neon_triple_to_lch, neon_triple_to_luv, neon_triple_to_xyz, }; use crate::xyz_target::XyzTarget; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, -}; +use crate::load_f32_and_deinterleave; use std::arch::aarch64::*; #[inline(always)] pub unsafe fn neon_channels_to_xyza_or_laba( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let target: XyzTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -44,351 +40,13 @@ pub unsafe fn neon_channels_to_xyza_or_laba { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - } - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low); - vst4q_f32(dst_ptr.add(cx * 4), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4), xyz_low_low); - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = vmovl_u16(vget_low_u16(b_high)); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = neon_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_low, y_high_low, 
z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let a_high = vmovl_high_u8(a_chan); - let a_high_low = vmulq_n_f32( - vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_high))), - 1f32 / 255f32, - ); - - let xyz_low_low = float32x4x4_t(x_high_low, y_high_low, z_high_low, a_high_low); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4 * 2), xyz_low_low); - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = neon_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_high_high, y_high_high, z_high_high, a_high_high); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4 * 3), xyz_low_low); - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - } - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low); - vst4q_f32(dst_ptr.add(cx * 4), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, 
y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4), xyz_low_low); - - cx += 8; - } - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + r_chan, g_chan, b_chan, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, ); match target { @@ -413,11 +71,7 @@ pub unsafe fn neon_channels_to_xyza_or_laba( src: *const f32, - transfer_function: TransferFunction, c1: float32x4_t, c2: float32x4_t, c3: float32x4_t, @@ -30,9 +27,8 @@ pub(crate) unsafe fn neon_xyz_lab_vld< c7: float32x4_t, c8: float32x4_t, c9: float32x4_t, -) -> (uint32x4_t, uint32x4_t, uint32x4_t) { +) -> (float32x4_t, float32x4_t, float32x4_t) { let target: XyzTarget = TARGET.into(); - let v_scale_color = vdupq_n_f32(255f32); let lab_pixel = vld3q_f32(src); let (mut r_f32, mut g_f32, mut b_f32) = (lab_pixel.0, lab_pixel.1, lab_pixel.2); @@ -61,22 +57,7 @@ pub(crate) unsafe fn neon_xyz_lab_vld< let (linear_r, linear_g, linear_b) = vcolorq_matrix_f32(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = neon_perform_gamma_transfer(transfer_function, r_f32); - g_f32 = neon_perform_gamma_transfer(transfer_function, g_f32); - b_f32 = neon_perform_gamma_transfer(transfer_function, b_f32); - - r_f32 = vmulq_f32(r_f32, v_scale_color); - g_f32 = vmulq_f32(g_f32, v_scale_color); - b_f32 = vmulq_f32(b_f32, v_scale_color); - ( - vcvtaq_u32_f32(r_f32), - vcvtaq_u32_f32(g_f32), - vcvtaq_u32_f32(b_f32), - ) + (linear_r, linear_g, linear_b) } #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -91,11 +72,10 @@ pub unsafe fn neon_xyz_to_channels< src_offset: usize, a_channel: *const f32, a_offset: usize, - dst: *mut u8, + dst: *mut f32, dst_offset: usize, width: u32, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if USE_ALPHA && !image_configuration.has_alpha() { @@ -118,133 +98,6 @@ pub unsafe fn neon_xyz_to_channels< let src_channels = 3usize; - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - neon_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); 
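[For context on the wide 16-pixel paths being deleted here and below: they converted four float32x4 registers to bytes through a saturating narrowing chain before storing. A sketch of that pattern, assuming an aarch64 NEON target (pack_to_u8x16 is an illustrative name, not a crate function):

    #[cfg(target_arch = "aarch64")]
    unsafe fn pack_to_u8x16(
        v0: std::arch::aarch64::float32x4_t,
        v1: std::arch::aarch64::float32x4_t,
        v2: std::arch::aarch64::float32x4_t,
        v3: std::arch::aarch64::float32x4_t,
    ) -> std::arch::aarch64::uint8x16_t {
        use std::arch::aarch64::*;
        let scale = vdupq_n_f32(255f32);
        // vcvtaq rounds to nearest, ties away from zero, into u32 lanes.
        let u0 = vcvtaq_u32_f32(vmulq_f32(v0, scale));
        let u1 = vcvtaq_u32_f32(vmulq_f32(v1, scale));
        let u2 = vcvtaq_u32_f32(vmulq_f32(v2, scale));
        let u3 = vcvtaq_u32_f32(vmulq_f32(v3, scale));
        // Saturating narrow: u32 -> u16, then u16 -> u8.
        let w01 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1));
        let w23 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3));
        vcombine_u8(vqmovn_u16(w01), vqmovn_u16(w23))
    }

The rework drops this stage from the SIMD kernels entirely: the vld/vst paths now keep float32x4 lanes and store them with vst3q_f32/vst4q_f32, deferring quantization to u8 to a later shared pass.]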
- - let (r_row1_, g_row1_, b_row1_) = - neon_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * src_channels); - - let (r_row2_, g_row2_, b_row2_) = - neon_xyz_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * src_channels); - - let (r_row3_, g_row3_, b_row3_) = - neon_xyz_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = vld1q_f32(offset_a_src_ptr); - let a_row0_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); - - let a_low_1_f = vld1q_f32(offset_a_src_ptr.add(4)); - let a_row1_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_1_f, 255f32)); - - let a_low_2_f = vld1q_f32(offset_a_src_ptr.add(8)); - let a_row2_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_2_f, 255f32)); - - let a_low_3_f = vld1q_f32(offset_a_src_ptr.add(12)); - let a_row3_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_3_f, 255f32)); - - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4q_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x3_t(b_row, g_row, r_row) - } - }; - vst3q_u8(dst_ptr, store_rows); - } - - cx += 16; - } - while cx + 4 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); @@ -253,160 +106,33 @@ pub unsafe fn neon_xyz_to_channels< let (r_row0_, g_row0_, b_row0_) = neon_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - neon_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = 
vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = vld1q_f32(offset_a_src_ptr); - let a_row0_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); - - let a_low_1_f = vld1q_f32(offset_a_src_ptr.add(4)); - let a_row1_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_1_f, 255f32)); - - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - vst3_u8(dst_ptr, store_rows); - } - - cx += 8; - } - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - neon_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - neon_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = vld1q_f32(offset_a_src_ptr); - let a_row0_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); - - let a_low_1_f = vld1q_f32(offset_a_src_ptr.add(4)); - let a_row1_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_1_f, 255f32)); - - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); + let a_row = vld1q_f32(offset_a_src_ptr); let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) + float32x4x4_t(r_row0_, g_row0_, b_row0_, a_row) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) + float32x4x4_t(b_row0_, g_row0_, r_row0_, a_row) } }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + vst4q_f32(dst_ptr, store_rows); } else { let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) + float32x4x3_t(r_row0_, g_row0_, b_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) + float32x4x3_t(b_row0_, g_row0_, r_row0_) } }; - let mut transient: [u8; 24] = [0; 24]; - 
vst3_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); + vst3q_f32(dst_ptr, store_rows); } cx += 4; diff --git a/src/neon/xyza_laba_to_image.rs b/src/neon/xyza_laba_to_image.rs index c1e0062..ed829ea 100644 --- a/src/neon/xyza_laba_to_image.rs +++ b/src/neon/xyza_laba_to_image.rs @@ -8,15 +8,12 @@ use crate::image::ImageConfiguration; use crate::neon::cie::{neon_lab_to_xyz, neon_lch_to_xyz, neon_luv_to_xyz}; use crate::neon::math::vcolorq_matrix_f32; -use crate::neon::*; use crate::xyz_target::XyzTarget; -use crate::TransferFunction; use std::arch::aarch64::*; #[inline(always)] pub(crate) unsafe fn neon_xyza_lab_vld( src: *const f32, - transfer_function: TransferFunction, c1: float32x4_t, c2: float32x4_t, c3: float32x4_t, @@ -26,9 +23,8 @@ pub(crate) unsafe fn neon_xyza_lab_vld (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { +) -> (float32x4_t, float32x4_t, float32x4_t, float32x4_t) { let target: XyzTarget = TARGET.into(); - let v_scale_color = vdupq_n_f32(255f32); let lab_pixel = vld4q_f32(src); let (mut r_f32, mut g_f32, mut b_f32) = (lab_pixel.0, lab_pixel.1, lab_pixel.2); @@ -57,23 +53,7 @@ pub(crate) unsafe fn neon_xyza_lab_vld usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if !image_configuration.has_alpha() { @@ -108,169 +87,6 @@ pub unsafe fn neon_xyza_to_image( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - neon_xyza_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - neon_xyza_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4q_u8(dst_ptr, store_rows); - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, 
g_row0_, b_row0_, a_row0_) = - neon_xyza_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - let a_row = vqmovn_u16(a_row01); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - - cx += 8; - } - while cx + 4 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); @@ -279,7 +95,6 @@ pub unsafe fn neon_xyza_to_image( src_ptr_0, - transfer_function, c1, c2, c3, @@ -291,31 +106,17 @@ pub unsafe fn neon_xyza_to_image { - uint8x8x4_t(r_row, g_row, b_row, a_row) + float32x4x4_t(r_row0_, g_row0_, b_row0_, a_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) + float32x4x4_t(b_row0_, g_row0_, r_row0_, a_row0_) } }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + vst4q_f32(dst_ptr, store_rows); cx += 4; } diff --git a/src/oklab.rs b/src/oklab.rs index e2e03aa..a4986b9 100644 --- a/src/oklab.rs +++ b/src/oklab.rs @@ -42,10 +42,16 @@ impl Oklab { Self::linear_rgb_to_oklab(linearized) } + #[inline] + /// Convert Linear Rgb to [Oklab] + pub fn from_linear_rgb(rgb: Rgb) -> Oklab { + Self::linear_rgb_to_oklab(rgb) + } + #[inline] /// Converts [Oklab] to [Rgb] using sRGB transfer function pub fn to_srgb(&self) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); let transferred = linear_rgb.gamma(TransferFunction::Srgb); transferred.to_u8() } @@ -53,7 +59,7 @@ impl Oklab { #[inline] /// Converts [Oklab] to [Rgb] using provided [TransferFunction] pub fn to_rgb(&self, transfer_function: TransferFunction) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); let transferred = linear_rgb.gamma(transfer_function); transferred.to_u8() } @@ -61,14 +67,14 @@ impl Oklab { #[inline] /// Converts [Oklab] to linear [Rgb] using sRGB transfer function pub fn to_srgb_f32(&self) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); linear_rgb.gamma(TransferFunction::Srgb) } #[inline] /// Converts [Oklab] to [Rgb] using provided [TransferFunction] pub fn to_rgb_f32(&self, transfer_function: TransferFunction) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); linear_rgb.gamma(transfer_function) } @@ -91,7 +97,7 @@ impl Oklab { #[inline] /// Converts to linear RGB - pub fn to_linear_srgb(&self) -> Rgb { + pub fn to_linear_rgb(&self) -> Rgb { let l_ = self.l + 0.3963377774f32 * self.a + 0.2158037573f32 * 
diff --git a/src/oklab.rs b/src/oklab.rs
index e2e03aa..a4986b9 100644
--- a/src/oklab.rs
+++ b/src/oklab.rs
@@ -42,10 +42,16 @@ impl Oklab {
         Self::linear_rgb_to_oklab(linearized)
     }
 
+    #[inline]
+    /// Converts linear [Rgb] to [Oklab]
+    pub fn from_linear_rgb(rgb: Rgb<f32>) -> Oklab {
+        Self::linear_rgb_to_oklab(rgb)
+    }
+
     #[inline]
     /// Converts [Oklab] to [Rgb] using sRGB transfer function
     pub fn to_srgb(&self) -> Rgb<u8> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         let transferred = linear_rgb.gamma(TransferFunction::Srgb);
         transferred.to_u8()
     }
@@ -53,7 +59,7 @@ impl Oklab {
     #[inline]
     /// Converts [Oklab] to [Rgb] using provided [TransferFunction]
     pub fn to_rgb(&self, transfer_function: TransferFunction) -> Rgb<u8> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         let transferred = linear_rgb.gamma(transfer_function);
         transferred.to_u8()
     }
@@ -61,14 +67,14 @@ impl Oklab {
     #[inline]
     /// Converts [Oklab] to linear [Rgb] using sRGB transfer function
     pub fn to_srgb_f32(&self) -> Rgb<f32> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         linear_rgb.gamma(TransferFunction::Srgb)
     }
 
     #[inline]
     /// Converts [Oklab] to [Rgb] using provided [TransferFunction]
     pub fn to_rgb_f32(&self, transfer_function: TransferFunction) -> Rgb<f32> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         linear_rgb.gamma(transfer_function)
     }
 
@@ -91,7 +97,7 @@ impl Oklab {
 
     #[inline]
     /// Converts to linear RGB
-    pub fn to_linear_srgb(&self) -> Rgb<f32> {
+    pub fn to_linear_rgb(&self) -> Rgb<f32> {
         let l_ = self.l + 0.3963377774f32 * self.a + 0.2158037573f32 * self.b;
         let m_ = self.l - 0.1055613458f32 * self.a - 0.0638541728f32 * self.b;
         let s_ = self.l - 0.0894841775f32 * self.a - 1.2914855480f32 * self.b;
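Note: with to_linear_srgb renamed to to_linear_rgb and the new from_linear_rgb, Oklab now exposes both halves of the conversion without a transfer function. A usage sketch under the new API (values illustrative; gamma is applied once, at the very end):

    use colorutils_rs::{Oklab, Rgb, TransferFunction};

    fn oklab_roundtrip_linear() {
        let linear = Rgb::<f32>::new(0.214, 0.214, 0.214); // already linearized RGB
        let oklab = Oklab::from_linear_rgb(linear); // no transfer function involved
        let restored = oklab.to_linear_rgb(); // still linear RGB
        let encoded = restored.gamma(TransferFunction::Srgb); // encode at the end
        let _ = encoded;
    }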
diff --git a/src/oklab_to_image.rs b/src/oklab_to_image.rs
index 157075e..35377dd 100644
--- a/src/oklab_to_image.rs
+++ b/src/oklab_to_image.rs
@@ -13,12 +13,11 @@ use crate::neon::neon_oklab_to_image;
 use crate::oklch::Oklch;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use crate::sse::sse_oklab_to_image;
-use crate::{Oklab, TransferFunction};
+use crate::{Oklab, Rgb, TransferFunction};
 #[cfg(feature = "rayon")]
 use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 #[cfg(feature = "rayon")]
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
-#[cfg(feature = "rayon")]
 use std::slice;
 
 #[allow(clippy::type_complexity)]
@@ -35,7 +34,7 @@ fn oklab_to_image(
     let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
 
     let mut _wide_row_handle: Option<
-        unsafe fn(usize, *const f32, u32, *mut u8, u32, u32, TransferFunction) -> usize,
+        unsafe fn(usize, *const f32, usize, *mut f32, u32, u32) -> usize,
     > = None;
 
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
@@ -53,34 +52,35 @@ fn oklab_to_image(
         _wide_row_handle = Some(neon_oklab_to_image::<CHANNELS_CONFIGURATION, TARGET>);
     }
 
+    let mut lut_table = vec![0u8; 2049];
+    for i in 0..2049 {
+        lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.)
+            .ceil()
+            .min(255.) as u8;
+    }
+
+    let channels = image_configuration.get_channels_count();
+
+    let src_slice_safe_align = unsafe {
+        slice::from_raw_parts(
+            src.as_ptr() as *const u8,
+            src_stride as usize * height as usize,
+        )
+    };
+
     #[cfg(feature = "rayon")]
     {
-        let src_slice_safe_align = unsafe {
-            slice::from_raw_parts(
-                src.as_ptr() as *const u8,
-                src_stride as usize * height as usize,
-            )
-        };
         dst.par_chunks_exact_mut(dst_stride as usize)
             .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize))
             .for_each(|(dst, src)| unsafe {
-                let channels = image_configuration.get_channels_count();
-
                 let mut _cx = 0usize;
 
+                let mut transient_row = vec![0f32; width as usize * channels];
+
                 let src_ptr = src.as_ptr() as *mut f32;
-                let dst_ptr = dst.as_mut_ptr();
 
                 if let Some(dispatcher) = _wide_row_handle {
-                    _cx = dispatcher(
-                        _cx,
-                        src.as_ptr() as *const f32,
-                        0,
-                        dst.as_mut_ptr(),
-                        0,
-                        width,
-                        transfer_function,
-                    )
+                    _cx = dispatcher(_cx, src_ptr, 0, transient_row.as_mut_ptr(), 0, width)
                 }
 
                 for x in _cx..width as usize {
@@ -92,26 +92,38 @@ fn oklab_to_image(
                     let rgb = match target {
                         OklabTarget::Oklab => {
                             let oklab = Oklab::new(l_x, l_y, l_z);
-                            oklab.to_rgb(transfer_function)
+                            oklab.to_linear_rgb()
                         }
                         OklabTarget::Oklch => {
                             let oklch = Oklch::new(l_x, l_y, l_z);
-                            oklch.to_rgb(transfer_function)
+                            oklch.to_linear_rgb()
                         }
                     };
 
-                    let dst = dst_ptr.add(x * channels);
-                    dst.add(image_configuration.get_r_channel_offset())
-                        .write_unaligned(rgb.r);
-                    dst.add(image_configuration.get_g_channel_offset())
-                        .write_unaligned(rgb.g);
-                    dst.add(image_configuration.get_b_channel_offset())
-                        .write_unaligned(rgb.b);
+                    let v_dst = transient_row.get_unchecked_mut((x * channels)..);
+                    *v_dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r;
+                    *v_dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g;
+                    *v_dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b;
                     if image_configuration.has_alpha() {
                         let l_a = source_p.add(3).read_unaligned();
-                        let a_value = (l_a * 255f32).max(0f32);
-                        dst.add(image_configuration.get_a_channel_offset())
-                            .write_unaligned(a_value as u8);
+                        *v_dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a;
+                    }
+                }
+
+                for (dst_chunks, src_chunks) in dst
+                    .chunks_exact_mut(channels)
+                    .zip(transient_row.chunks_exact_mut(channels))
+                {
+                    let rgb = (Rgb::<f32>::new(src_chunks[0], src_chunks[1], src_chunks[2])
+                        * Rgb::<f32>::dup(2048f32))
+                    .cast::<u16>();
+
+                    dst_chunks[0] = *lut_table.get_unchecked((rgb.r as usize).min(2048));
+                    dst_chunks[1] = *lut_table.get_unchecked((rgb.g as usize).min(2048));
+                    dst_chunks[2] = *lut_table.get_unchecked((rgb.b as usize).min(2048));
+                    if image_configuration.has_alpha() {
+                        let a_lin = (src_chunks[3] * 255f32).round() as u8;
+                        dst_chunks[3] = a_lin;
+                    }
+                }
             });
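Note: the parallel path above no longer calls the transfer function per pixel; it encodes the whole transient row through the 2049-entry lut_table built earlier, where a linear value in [0, 1] is scaled by 2048 and the clamped integer part indexes the table. Out-of-gamut Oklab/Oklch inputs can yield linear values outside [0, 1], which is why the index must stay within 0..=2048. A scalar sketch of the same row encoding (function name illustrative):

    // Gamma-encode one row of linear f32 samples through a 2049-entry u8 LUT.
    fn encode_row(lut: &[u8; 2049], linear_row: &[f32], out: &mut [u8]) {
        for (dst, &v) in out.iter_mut().zip(linear_row.iter()) {
            let idx = ((v.max(0.) * 2048.) as usize).min(2048);
            *dst = lut[idx];
        }
    }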
+ for (dst_chunks, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact_mut(channels)) + { + let rgb = (Rgb::::new(src_chunks[0], src_chunks[1], src_chunks[2]) + * Rgb::::dup(2048f32)) + .cast::(); + + dst_chunks[0] = *lut_table.get_unchecked(rgb.r as usize); + dst_chunks[1] = *lut_table.get_unchecked(rgb.g as usize); + dst_chunks[2] = *lut_table.get_unchecked(rgb.b as usize); + if image_configuration.has_alpha() { + let a_lin = (src_chunks[4] * 255f32).round() as u8; + dst_chunks[0] = a_lin; + } + } + } } } } diff --git a/src/oklch.rs b/src/oklch.rs index cea1392..758bb06 100644 --- a/src/oklch.rs +++ b/src/oklch.rs @@ -38,7 +38,17 @@ impl Oklch { Oklch::from_oklab(oklab) } - /// Converts *Oklch* into *Rgb* + /// Converts Linear [Rgb] into [Oklch] + /// + /// # Arguments + /// `transfer_function` - Transfer function into linear colorspace and its inverse + #[inline] + pub fn from_linear_rgb(rgb: Rgb) -> Oklch { + let oklab = Oklab::from_linear_rgb(rgb); + Oklch::from_oklab(oklab) + } + + /// Converts [Oklch] into [Rgb] /// /// # Arguments /// `transfer_function` - Transfer function into linear colorspace and its inverse @@ -48,6 +58,13 @@ impl Oklch { oklab.to_rgb(transfer_function) } + /// Converts [Oklch] into linear [Rgb] + #[inline] + pub fn to_linear_rgb(&self) -> Rgb { + let oklab = self.to_oklab(); + oklab.to_linear_rgb() + } + /// Converts *Oklab* to *Oklch* #[inline] pub fn from_oklab(oklab: Oklab) -> Oklch { diff --git a/src/rgb.rs b/src/rgb.rs index 2639ecc..f234bf6 100644 --- a/src/rgb.rs +++ b/src/rgb.rs @@ -985,3 +985,22 @@ where Rgb::::new(self.r.powf(rhs.r), self.g.powf(rhs.g), self.b.powf(rhs.b)) } } + +impl Rgb { + pub fn cast(self) -> Rgb + where + T: AsPrimitive, + V: Copy + 'static, + { + Rgb::new(self.r.as_(), self.g.as_(), self.b.as_()) + } +} + +impl Rgb +where + T: Float + 'static, +{ + pub fn round(self) -> Rgb { + Rgb::new(self.r.round(), self.g.round(), self.b.round()) + } +} diff --git a/src/sse/cie.rs b/src/sse/cie.rs index 2825467..ff2f349 100644 --- a/src/sse/cie.rs +++ b/src/sse/cie.rs @@ -10,9 +10,8 @@ use crate::luv::{ LUV_WHITE_V_PRIME, }; use crate::sse::{ - _mm_color_matrix_ps, _mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps, perform_sse_linear_transfer, + _mm_color_matrix_ps, _mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps, }; -use crate::TransferFunction; use erydanos::{_mm_atan2_ps, _mm_cbrt_fast_ps, _mm_cos_ps, _mm_hypot_ps, _mm_sin_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -20,10 +19,10 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline(always)] -pub(crate) unsafe fn sse_triple_to_xyz( - r: __m128i, - g: __m128i, - b: __m128i, +pub unsafe fn sse_triple_to_xyz( + r: __m128, + g: __m128, + b: __m128, c1: __m128, c2: __m128, c3: __m128, @@ -33,24 +32,15 @@ pub(crate) unsafe fn sse_triple_to_xyz( c7: __m128, c8: __m128, c9: __m128, - transfer_function: TransferFunction, ) -> (__m128, __m128, __m128) { - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); - let r_linear = perform_sse_linear_transfer(transfer_function, r_f); - let g_linear = perform_sse_linear_transfer(transfer_function, g_f); - let b_linear = perform_sse_linear_transfer(transfer_function, b_f); - let (x, y, z) = _mm_color_matrix_ps( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, + r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); (x, y, z) } #[inline(always)] 
 #[inline(always)]
-pub(crate) unsafe fn sse_triple_to_luv(
+pub unsafe fn sse_triple_to_luv(
     x: __m128,
     y: __m128,
     z: __m128,
@@ -80,7 +70,7 @@ pub(crate) unsafe fn sse_triple_to_luv(
 }
 
 #[inline(always)]
-pub(crate) unsafe fn sse_triple_to_lab(
+pub unsafe fn sse_triple_to_lab(
     x: __m128,
     y: __m128,
     z: __m128,
@@ -106,7 +96,7 @@ pub(crate) unsafe fn sse_triple_to_lab(
 }
 
 #[inline(always)]
-pub(crate) unsafe fn sse_triple_to_lch(
+pub unsafe fn sse_triple_to_lch(
     x: __m128,
     y: __m128,
     z: __m128,
@@ -118,7 +108,7 @@
-pub(crate) unsafe fn sse_lab_to_xyz(l: __m128, a: __m128, b: __m128) -> (__m128, __m128, __m128) {
+pub unsafe fn sse_lab_to_xyz(l: __m128, a: __m128, b: __m128) -> (__m128, __m128, __m128) {
     let y = _mm_mul_ps(
         _mm_add_ps(l, _mm_set1_ps(16f32)),
         _mm_set1_ps(1f32 / 116f32),
@@ -144,7 +134,7 @@
-pub(crate) unsafe fn sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128, __m128, __m128) {
+pub unsafe fn sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128, __m128, __m128) {
     let zeros = _mm_setzero_ps();
     let zero_mask = _mm_cmpeq_ps(l, zeros);
     let l13 = _mm_rcp_ps(_mm_mul_ps(l, _mm_set1_ps(13f32)));
@@ -183,7 +173,7 @@
-pub(crate) unsafe fn sse_lch_to_xyz(l: __m128, c: __m128, h: __m128) -> (__m128, __m128, __m128) {
+pub unsafe fn sse_lch_to_xyz(l: __m128, c: __m128, h: __m128) -> (__m128, __m128, __m128) {
     let u = _mm_mul_ps(c, _mm_cos_ps(h));
     let v = _mm_mul_ps(c, _mm_sin_ps(h));
     sse_luv_to_xyz(l, u, v)
diff --git a/src/sse/image_to_linear_u8.rs b/src/sse/image_to_linear_u8.rs
deleted file mode 100644
index 35e73c0..0000000
--- a/src/sse/image_to_linear_u8.rs
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved.
- * //
- * // Use of this source code is governed by a BSD-style
- * // license that can be found in the LICENSE file.
- */ - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub mod sse_image_to_linear_unsigned { - #[cfg(target_arch = "x86")] - use std::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use std::arch::x86_64::*; - - use crate::image::ImageConfiguration; - use crate::sse::*; - use crate::{ - load_u8_and_deinterleave, store_and_interleave_v3_half_u8, store_and_interleave_v3_u8, - store_and_interleave_v4_half_u8, store_and_interleave_v4_u8, TransferFunction, - }; - - #[inline(always)] - unsafe fn sse_triple_to_linear_u8( - r: __m128i, - g: __m128i, - b: __m128i, - transfer_function: TransferFunction, - ) -> (__m128i, __m128i, __m128i) { - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); - let u8_backwards = _mm_set1_ps(255f32); - let r_linear = _mm_mul_ps( - match INTO_LINEAR { - true => perform_sse_linear_transfer(transfer_function, r_f), - false => perform_sse_gamma_transfer(transfer_function, r_f), - }, - u8_backwards, - ); - let g_linear = _mm_mul_ps( - match INTO_LINEAR { - true => perform_sse_linear_transfer(transfer_function, g_f), - false => perform_sse_gamma_transfer(transfer_function, g_f), - }, - u8_backwards, - ); - let b_linear = _mm_mul_ps( - match INTO_LINEAR { - true => perform_sse_linear_transfer(transfer_function, b_f), - false => perform_sse_gamma_transfer(transfer_function, b_f), - }, - u8_backwards, - ); - ( - _mm_cvtps_epi32(r_linear), - _mm_cvtps_epi32(g_linear), - _mm_cvtps_epi32(b_linear), - ) - } - - #[target_feature(enable = "sse4.1")] - pub(crate) unsafe fn sse_channels_to_linear_u8< - const CHANNELS_CONFIGURATION: u8, - const USE_ALPHA: bool, - const INTO_LINEAR: bool, - >( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut u8, - dst_offset: usize, - transfer_function: TransferFunction, - ) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = dst.add(dst_offset); - - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = sse_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = sse_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_high = _mm_unpackhi_epi8(r_chan, zeros); - let g_high = _mm_unpackhi_epi8(g_chan, zeros); - let b_high = _mm_unpackhi_epi8(b_chan, zeros); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = _mm_cvtepu16_epi32(b_high); - - let (x_high_low, y_high_low, z_high_low) = sse_triple_to_linear_u8::( - r_high_low, - g_high_low, - b_high_low, - transfer_function, - ); - - let r_high_high = 
_mm_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm_unpackhi_epi16(b_high, zeros); - - let (x_high_high, y_high_high, z_high_high) = sse_triple_to_linear_u8::( - r_high_high, - g_high_high, - b_high_high, - transfer_function, - ); - - let r_u_norm = _mm_packus_epi16( - _mm_packus_epi32(x_low_low, x_low_high), - _mm_packus_epi32(x_high_low, x_high_high), - ); - - let g_u_norm = _mm_packus_epi16( - _mm_packus_epi32(y_low_low, y_low_high), - _mm_packus_epi32(y_high_low, y_high_high), - ); - - let b_u_norm = _mm_packus_epi16( - _mm_packus_epi32(z_low_low, z_low_high), - _mm_packus_epi32(z_high_low, z_high_high), - ); - - let dst = dst_ptr.add(cx * channels); - - if USE_ALPHA { - store_and_interleave_v4_u8!( - dst, - image_configuration, - r_u_norm, - g_u_norm, - b_u_norm, - a_chan - ); - } else { - store_and_interleave_v3_u8!(dst, image_configuration, r_u_norm, g_u_norm, b_u_norm); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = sse_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = sse_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_u_norm = _mm_packus_epi16(_mm_packus_epi32(x_low_low, x_low_high), zeros); - - let g_u_norm = _mm_packus_epi16(_mm_packus_epi32(y_low_low, y_low_high), zeros); - - let b_u_norm = _mm_packus_epi16(_mm_packus_epi32(z_low_low, z_low_high), zeros); - - let dst = dst_ptr.add(cx * channels); - - if USE_ALPHA { - store_and_interleave_v4_half_u8!( - dst, - image_configuration, - r_u_norm, - g_u_norm, - b_u_norm, - a_chan - ); - } else { - store_and_interleave_v3_half_u8!( - dst, - image_configuration, - r_u_norm, - g_u_norm, - b_u_norm - ); - } - - cx += 8; - } - - cx - } -} diff --git a/src/sse/image_to_oklab.rs b/src/sse/image_to_oklab.rs index 897f91f..58befe8 100644 --- a/src/sse/image_to_oklab.rs +++ b/src/sse/image_to_oklab.rs @@ -4,7 +4,7 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ -use crate::sse::perform_sse_linear_transfer; +use crate::sse::{sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps}; use erydanos::{_mm_atan2_ps, _mm_cbrt_fast_ps, _mm_hypot_fast_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -14,29 +14,18 @@ use std::arch::x86_64::*; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::sse::{ - _mm_color_matrix_ps, sse_deinterleave_rgb, sse_deinterleave_rgba, sse_interleave_ps_rgb, + _mm_color_matrix_ps, sse_interleave_ps_rgb, sse_interleave_ps_rgba, }; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, store_and_interleave_v3_direct_f32, - store_and_interleave_v4_direct_f32, TransferFunction, -}; +use crate::{load_f32_and_deinterleave, store_and_interleave_v3_direct_f32, store_and_interleave_v4_direct_f32}; macro_rules! triple_to_oklab { - ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr, + ($r: expr, $g: expr, $b: expr, $target: expr, $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps($r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps($g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps($b), u8_scale); - let r_linear = perform_sse_linear_transfer($transfer, r_f); - let g_linear = perform_sse_linear_transfer($transfer, g_f); - let b_linear = perform_sse_linear_transfer($transfer, b_f); - let (l_l, l_m, l_s) = _mm_color_matrix_ps( - r_linear, g_linear, b_linear, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, + $r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, ); let l_ = _mm_cbrt_fast_ps(l_l); @@ -60,12 +49,9 @@ macro_rules! 
triple_to_oklab { #[target_feature(enable = "sse4.1")] pub unsafe fn sse_image_to_oklab( start_cx: usize, - src: *const u8, - src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -98,217 +84,15 @@ pub unsafe fn sse_image_to_oklab( - src: *const f32, - transfer_function: TransferFunction, -) -> (__m128i, __m128i, __m128i, __m128i) { - let v_scale_alpha = _mm_set1_ps(255f32); - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - - let (mut r_f32, mut g_f32, mut b_f32, mut a_f32) = - load_f32_and_deinterleave!(src, image_configuration); - - r_f32 = perform_sse_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_sse_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_sse_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm_mul_ps(b_f32, v_scale_alpha); - if USE_ALPHA { - a_f32 = _mm_mul_ps(a_f32, v_scale_alpha); - } - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - - if USE_ALPHA { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_cvtps_epi32(_mm_round_ps::(a_f32)), - ) - } else { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_set1_epi32(255), - ) - } -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn sse_linear_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - sse_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - sse_gamma_vld::(src_ptr_1, transfer_function); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - sse_gamma_vld::(src_ptr_2, transfer_function); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - sse_gamma_vld::(src_ptr_3, transfer_function); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row23 = _mm_packus_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packus_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packus_epi32(b_row2_, b_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row23 = _mm_packus_epi32(a_row2_, a_row3_); - let a_row = _mm_packus_epi16(a_row01, a_row23); - store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); - } else { - 
store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - - let zeros = _mm_setzero_si128(); - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - sse_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - sse_gamma_vld::(src_ptr_1, transfer_function); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row = _mm_packus_epi16(a_row01, zeros); - store_and_interleave_v4_half_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 8; - } - - cx -} diff --git a/src/sse/mod.rs b/src/sse/mod.rs index bbfe330..536b84a 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -12,16 +12,10 @@ mod hsv_to_image; mod image_to_hsv; -mod image_to_linear_u8; - -mod linear_to_image; - mod math; mod support; -mod to_linear; - mod to_xyz_lab; mod to_xyza_laba; @@ -46,18 +40,16 @@ pub use gamma_curves::*; pub use hsv_to_image::*; pub use image_to_hsv::*; pub use image_to_jzazbz::sse_image_to_jzazbz; -pub use image_to_linear_u8::*; pub use image_to_oklab::sse_image_to_oklab; pub use jzazbz_to_image::sse_jzazbz_to_image; -pub use linear_to_image::*; pub use linear_to_planar::sse_linear_plane_to_gamma; pub use math::*; pub use oklab_to_image::sse_oklab_to_image; pub use planar_to_linear::sse_plane_to_linear; pub use support::*; -pub use to_linear::*; pub use to_sigmoidal::sse_image_to_sigmoidal_row; pub use to_xyz_lab::*; pub use to_xyza_laba::*; pub use xyz_lab_to_image::*; pub use xyza_laba_to_image::*; +pub use cie::*; \ No newline at end of file diff --git a/src/sse/oklab_to_image.rs b/src/sse/oklab_to_image.rs index 011162e..77119d1 100644 --- a/src/sse/oklab_to_image.rs +++ b/src/sse/oklab_to_image.rs @@ -7,13 +7,10 @@ use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::sse::{ - _mm_color_matrix_ps, _mm_cube_ps, perform_sse_gamma_transfer, sse_deinterleave_rgb_ps, - sse_deinterleave_rgba_ps, sse_interleave_rgb, sse_interleave_rgba, -}; -use crate::{ - load_f32_and_deinterleave, store_and_interleave_v3_half_u8, store_and_interleave_v3_u8, - store_and_interleave_v4_half_u8, store_and_interleave_v4_u8, TransferFunction, + _mm_color_matrix_ps, _mm_cube_ps, sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps, }; +use crate::sse::{sse_interleave_ps_rgb, sse_interleave_ps_rgba}; +use crate::{load_f32_and_deinterleave, store_and_interleave_v3_f32, store_and_interleave_v4_f32}; use erydanos::{_mm_cos_ps, _mm_sin_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -23,7 +20,6 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn sse_oklab_vld( src: *const f32, - transfer: TransferFunction, oklab_target: OklabTarget, m0: __m128, m1: __m128, @@ -43,11 +39,10 @@ unsafe fn sse_oklab_vld( c6: __m128, c7: __m128, c8: __m128, -) -> (__m128i, __m128i, __m128i, 
__m128i) { - let v_scale_alpha = _mm_set1_ps(255f32); +) -> (__m128, __m128, __m128, __m128) { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (l, mut a, mut b, mut a_f32) = load_f32_and_deinterleave!(src, image_configuration); + let (l, mut a, mut b, a_f32) = load_f32_and_deinterleave!(src, image_configuration); if oklab_target == OklabTarget::Oklch { let a0 = _mm_mul_ps(a, _mm_cos_ps(b)); @@ -64,45 +59,17 @@ unsafe fn sse_oklab_vld( l_s = _mm_cube_ps(l_s); let (r_l, g_l, b_l) = _mm_color_matrix_ps(l_l, l_m, l_s, c0, c1, c2, c3, c4, c5, c6, c7, c8); - - let mut r_f32 = perform_sse_gamma_transfer(transfer, r_l); - let mut g_f32 = perform_sse_gamma_transfer(transfer, g_l); - let mut b_f32 = perform_sse_gamma_transfer(transfer, b_l); - - r_f32 = _mm_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm_mul_ps(b_f32, v_scale_alpha); - if image_configuration.has_alpha() { - a_f32 = _mm_mul_ps(a_f32, v_scale_alpha); - } - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - - if image_configuration.has_alpha() { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_cvtps_epi32(_mm_round_ps::(a_f32)), - ) - } else { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_set1_epi32(255), - ) - } + (r_l, g_l, b_l, a_f32) } #[target_feature(enable = "sse4.1")] pub unsafe fn sse_oklab_to_image( start_cx: usize, src: *const f32, - src_offset: u32, - dst: *mut u8, + src_offset: usize, + dst: *mut f32, dst_offset: u32, width: u32, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -133,224 +100,32 @@ pub unsafe fn sse_oklab_to_image( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = sse_oklab_vld::( - src_ptr_2, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = sse_oklab_vld::( - src_ptr_3, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row23 = _mm_packus_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packus_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packus_epi32(b_row2_, b_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row23 
= _mm_packus_epi32(a_row2_, a_row3_); - let a_row = _mm_packus_epi16(a_row01, a_row23); - store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); - } else { - store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); + while cx + 4 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_, a_row0_) = sse_oklab_vld::( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, + src_ptr_0, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, c5, c6, c7, c8, ); - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset as usize) as *mut f32).add(cx * channels); if image_configuration.has_alpha() { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row = _mm_packus_epi16(a_row01, zeros); - store_and_interleave_v4_half_u8!( + store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_, + a_row0_ ); } else { - store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); + store_and_interleave_v3_f32!(dst_ptr, image_configuration, r_row0_, g_row0_, b_row0_); } - cx += 8; + cx += 4; } cx diff --git a/src/sse/to_linear.rs b/src/sse/to_linear.rs deleted file mode 100644 index 1706f20..0000000 --- a/src/sse/to_linear.rs +++ /dev/null @@ -1,264 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - -use crate::gamma_curves::TransferFunction; -use crate::image::ImageConfiguration; -use crate::sse::*; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, store_and_interleave_v3_f32, - store_and_interleave_v4_f32, -}; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -#[inline(always)] -unsafe fn sse_triple_to_linear( - r: __m128i, - g: __m128i, - b: __m128i, - transfer_function: TransferFunction, -) -> (__m128, __m128, __m128) { - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); - let r_linear = perform_sse_linear_transfer(transfer_function, r_f); - let g_linear = perform_sse_linear_transfer(transfer_function, g_f); - let b_linear = perform_sse_linear_transfer(transfer_function, b_f); - (r_linear, g_linear, b_linear) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn sse_channels_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = - sse_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - let ptr = dst_ptr.add(cx * 4); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - store_and_interleave_v3_f32!(ptr, image_configuration, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = - sse_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 16); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - let r_high = _mm_unpackhi_epi8(r_chan, zeros); - let g_high = _mm_unpackhi_epi8(g_chan, zeros); - let b_high = _mm_unpackhi_epi8(b_chan, zeros); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = 
_mm_cvtepu16_epi32(b_high); - - let (x_high_low, y_high_low, z_high_low) = - sse_triple_to_linear(r_high_low, g_high_low, b_high_low, transfer_function); - - let a_high = _mm_unpackhi_epi8(a_chan, zeros); - - if USE_ALPHA { - let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 2); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low, - a_high_low - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 2); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low - ); - } - - let r_high_high = _mm_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm_unpackhi_epi16(b_high, zeros); - - let (x_high_high, y_high_high, z_high_high) = - sse_triple_to_linear(r_high_high, g_high_high, b_high_high, transfer_function); - - if USE_ALPHA { - let a_high_high = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 3); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high, - a_high_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 3); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high - ); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = - sse_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - let ptr = dst_ptr.add(cx * 4); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - store_and_interleave_v3_f32!(ptr, image_configuration, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = - sse_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 16); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - cx += 8; - } - - cx -} diff --git a/src/sse/to_xyz_lab.rs b/src/sse/to_xyz_lab.rs index e04019e..af7416d 100644 --- a/src/sse/to_xyz_lab.rs +++ b/src/sse/to_xyz_lab.rs @@ -5,12 +5,11 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::sse::cie::{sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; use crate::sse::*; use crate::xyz_target::XyzTarget; -use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half}; +use crate::load_f32_and_deinterleave; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -23,7 +22,7 @@ pub unsafe fn sse_channels_to_xyz_or_lab< const TARGET: u8, >( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, @@ -31,7 +30,6 @@ pub unsafe fn sse_channels_to_xyz_or_lab< a_linearized: *mut f32, a_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { if USE_ALPHA && a_linearized.is_null() { panic!("Null alpha channel with requirements of linearized alpha if not supported"); @@ -53,25 +51,15 @@ pub unsafe fn sse_channels_to_xyz_or_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 4 < width as usize { + let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, + r_chan, + g_chan, + b_chan, cq1, cq2, cq3, @@ -81,7 +69,6 @@ pub unsafe fn sse_channels_to_xyz_or_lab< cq7, cq8, cq9, - transfer_function, ); match target { @@ -111,307 +98,13 @@ pub unsafe fn sse_channels_to_xyz_or_lab< _mm_storeu_ps(dst_ptr.add(cx * 3 + 4), v1); _mm_storeu_ps(dst_ptr.add(cx * 3 + 8), v2); - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = sse_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_high, y_low_high, z_low_high); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 8), v2); - - let r_high = _mm_unpackhi_epi8(r_chan, zeros); - let g_high = _mm_unpackhi_epi8(g_chan, zeros); - let b_high = _mm_unpackhi_epi8(b_chan, zeros); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = _mm_cvtepu16_epi32(b_high); - - let (mut x_high_low, 
mut y_high_low, mut z_high_low) = sse_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_low, y_high_low, z_high_low); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 2), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 2 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 2 + 8), v2); - - let r_high_high = _mm_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm_unpackhi_epi16(b_high, zeros); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = sse_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_high, y_high_high, z_high_high); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 3 + 8), v2); - if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_low))), - u8_scale, - ); - - _mm_storeu_ps(a_ptr.add(cx + 4), a_low_high); - - let a_high = _mm_unpackhi_epi8(a_chan, zeros); - - let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx + 4 * 2), a_high_low); - - let a_high_high = - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_high, zeros)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx + 4 * 3), a_high_high); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - 
cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_low, y_low_low, z_low_low); - _mm_storeu_ps(dst_ptr.add(cx * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 8), v2); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = sse_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_high, y_low_high, z_low_high); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 8), v2); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_low))), - u8_scale, - ); - - _mm_storeu_ps(a_ptr.add(cx + 4), a_low_high); + _mm_storeu_ps(a_ptr.add(cx), a_chan); } - cx += 8; + cx += 4; } cx diff --git a/src/sse/to_xyza_laba.rs b/src/sse/to_xyza_laba.rs index cb236fb..30f5b73 100644 --- a/src/sse/to_xyza_laba.rs +++ b/src/sse/to_xyza_laba.rs @@ -5,12 +5,11 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::sse::cie::{sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; use crate::sse::*; use crate::xyz_target::XyzTarget; -use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half, store_and_interleave_v4_f32}; +use crate::load_f32_and_deinterleave; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -19,13 +18,12 @@ use std::arch::x86_64::*; #[target_feature(enable = "sse4.1")] pub unsafe fn sse_channels_to_xyza_laba( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { const CHANNELS: usize = 4; let target: XyzTarget = TARGET.into(); @@ -48,264 +46,13 @@ pub unsafe fn sse_channels_to_xyza_laba { - let (l, a, b) = sse_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let a_low = _mm_cvtepu8_epi16(a_chan); - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - let (v0, v1, v2, v3) = sse_interleave_ps_rgba(x_low_low, y_low_low, z_low_low, a_low_low); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS), v0); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 8), v2); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 12), v3); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = sse_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr0 = dst_ptr.add(cx * CHANNELS + 4 * CHANNELS); - store_and_interleave_v4_f32!( - ptr0, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - - let r_high = _mm_unpackhi_epi8(r_chan, _mm_setzero_si128()); - let g_high = _mm_unpackhi_epi8(g_chan, _mm_setzero_si128()); - let b_high = _mm_unpackhi_epi8(b_chan, _mm_setzero_si128()); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = _mm_cvtepu16_epi32(b_high); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = sse_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - 
XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let a_high = _mm_unpackhi_epi8(a_chan, _mm_setzero_si128()); - - let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - let (v0, v1, v2, v3) = - sse_interleave_ps_rgba(x_high_low, y_high_low, z_high_low, a_high_low); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2), v0); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2 + 8), v2); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2 + 12), v3); - - let r_high_high = _mm_unpackhi_epi16(r_high, _mm_setzero_si128()); - let g_high_high = _mm_unpackhi_epi16(g_high, _mm_setzero_si128()); - let b_high_high = _mm_unpackhi_epi16(b_high, _mm_setzero_si128()); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = sse_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let a_high_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_unpackhi_epi16(a_high, _mm_setzero_si128())), - u8_scale, - ); - - let (v0, v1, v2, v3) = - sse_interleave_ps_rgba(x_high_high, y_high_high, z_high_high, a_high_high); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3 + 8), v2); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3 + 12), v3); - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 4 < width as usize { + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + r_chan, g_chan, b_chan, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, ); match target { @@ -330,71 +77,13 @@ pub unsafe fn sse_channels_to_xyza_laba { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - 
x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr0 = dst_ptr.add(cx * CHANNELS + 4 * CHANNELS); - store_and_interleave_v4_f32!( - ptr0, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - - cx += 8; + cx += 4; } cx diff --git a/src/sse/xyz_lab_to_image.rs b/src/sse/xyz_lab_to_image.rs index 28b9098..db8bb99 100644 --- a/src/sse/xyz_lab_to_image.rs +++ b/src/sse/xyz_lab_to_image.rs @@ -7,25 +7,22 @@ use crate::image::ImageConfiguration; use crate::sse::cie::{sse_lab_to_xyz, sse_lch_to_xyz, sse_luv_to_xyz}; -use crate::sse::{ - _mm_color_matrix_ps, perform_sse_gamma_transfer, sse_deinterleave_rgb_ps, sse_interleave_rgb, - sse_interleave_rgba, -}; +use crate::sse::{_mm_color_matrix_ps, sse_deinterleave_rgb_ps}; +use crate::sse::{sse_interleave_ps_rgb, sse_interleave_ps_rgba}; use crate::xyz_target::XyzTarget; -use crate::TransferFunction; +use crate::{store_and_interleave_v3_f32, store_and_interleave_v4_f32}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; #[inline(always)] -unsafe fn sse_xyz_lab_vld< +pub unsafe fn sse_xyz_lab_vld< const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, const TARGET: u8, >( src: *const f32, - transfer_function: TransferFunction, c1: __m128, c2: __m128, c3: __m128, @@ -35,9 +32,8 @@ unsafe fn sse_xyz_lab_vld< c7: __m128, c8: __m128, c9: __m128, -) -> (__m128i, __m128i, __m128i) { +) -> (__m128, __m128, __m128) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm_set1_ps(255f32); let lab_pixel_0 = _mm_loadu_ps(src); let lab_pixel_1 = _mm_loadu_ps(src.add(4)); let lab_pixel_2 = _mm_loadu_ps(src.add(8)); @@ -68,23 +64,7 @@ unsafe fn sse_xyz_lab_vld< let (linear_r, linear_g, linear_b) = _mm_color_matrix_ps(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = perform_sse_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_sse_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_sse_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm_mul_ps(r_f32, v_scale_color); - g_f32 = _mm_mul_ps(g_f32, v_scale_color); - b_f32 = _mm_mul_ps(b_f32, v_scale_color); - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - ) + (linear_r, linear_g, linear_b) } #[target_feature(enable = "sse4.1")] @@ -98,11 +78,10 @@ pub unsafe fn sse_xyz_to_channels< src_offset: usize, a_channel: *const f32, a_offset: usize, - dst: *mut u8, + dst: *mut f32, dst_offset: usize, width: u32, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if USE_ALPHA && !image_configuration.has_alpha() { @@ -125,11 +104,7 @@ pub unsafe fn sse_xyz_to_channels< let src_channels = 3usize; - let color_rescale = _mm_set1_ps(255f32); - - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { + while cx + 4 < width as 
usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); @@ -137,233 +112,28 @@ pub unsafe fn sse_xyz_to_channels< let (r_row0_, g_row0_, b_row0_) = sse_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - sse_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * src_channels); - - let (r_row2_, g_row2_, b_row2_) = - sse_xyz_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let src_ptr_3 = offset_src_ptr.add(4 * 3 * src_channels); - - let (r_row3_, g_row3_, b_row3_) = - sse_xyz_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - - let r_row23 = _mm_packs_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packs_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packs_epi32(b_row2_, b_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm_loadu_ps(offset_a_src_ptr.add(4)); - let a_row1_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_low_2_f = _mm_loadu_ps(offset_a_src_ptr.add(8)); - let a_row2_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_2_f, - color_rescale, - ))); - - let a_low_3_f = _mm_loadu_ps(offset_a_src_ptr.add(12)); - let a_row3_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_3_f, - color_rescale, - ))); - - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - let a_row23 = _mm_packs_epi32(a_row2_, a_row3_); - let a_row = _mm_packus_epi16(a_row01, a_row23); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, store_rows.1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, store_rows.2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, store_rows.3); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgb(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgb(b_row, g_row, r_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, store_rows.1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, store_rows.2); - } - - cx += 16; 
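
Dropping the u8 output path also changes how results are stored: the deleted code above narrows through `_mm_packs_epi32` / `_mm_packus_epi16` before `_mm_storeu_si128`, while the f32 path interleaves four planar registers and writes them with `_mm_storeu_ps`. A sketch of the planar-to-interleaved step behind `sse_interleave_ps_rgba` / `store_and_interleave_v4_f32!` (an assumed shape, not necessarily the crate's exact helper); it is a 4x4 f32 transpose:

    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    // Turn four plane registers (RRRR / GGGG / BBBB / AAAA) into four
    // interleaved RGBA pixels, ready for four `_mm_storeu_ps` stores.
    #[cfg(target_arch = "x86_64")]
    unsafe fn interleave_ps_rgba(
        r: __m128,
        g: __m128,
        b: __m128,
        a: __m128,
    ) -> (__m128, __m128, __m128, __m128) {
        let rg_lo = _mm_unpacklo_ps(r, g); // r0 g0 r1 g1
        let rg_hi = _mm_unpackhi_ps(r, g); // r2 g2 r3 g3
        let ba_lo = _mm_unpacklo_ps(b, a); // b0 a0 b1 a1
        let ba_hi = _mm_unpackhi_ps(b, a); // b2 a2 b3 a3
        (
            _mm_movelh_ps(rg_lo, ba_lo), // pixel 0: r0 g0 b0 a0
            _mm_movehl_ps(ba_lo, rg_lo), // pixel 1: r1 g1 b1 a1
            _mm_movelh_ps(rg_hi, ba_hi), // pixel 2: r2 g2 b2 a2
            _mm_movehl_ps(ba_hi, rg_hi), // pixel 3: r3 g3 b3 a3
        )
    }

Four pixels per iteration is the natural width here: one `__m128` per plane holds exactly four f32 lanes, which is why the 8- and 16-pixel unrolls of the old u8 path (and their tail loops) disappear throughout this patch.
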
- } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - sse_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + let a_row = _mm_loadu_ps(offset_a_src_ptr); + + store_and_interleave_v4_f32!( + dst_ptr, + image_configuration, + r_row0_, + g_row0_, + b_row0_, + a_row ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - sse_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm_loadu_ps(offset_a_src_ptr.add(4)); - let a_row1_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - let a_row = _mm_packus_epi16(a_row01, zeros); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, store_rows.1); } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgb(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgb(b_row, g_row, r_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - let regi = store_rows.1; - std::ptr::copy_nonoverlapping(®i as *const _ as *const u8, dst_ptr.add(16), 8); + store_and_interleave_v3_f32!(dst_ptr, image_configuration, r_row0_, g_row0_, b_row0_); } - cx += 8; + cx += 4; } cx diff --git a/src/sse/xyza_laba_to_image.rs b/src/sse/xyza_laba_to_image.rs index fac74d4..d2cd846 100644 --- a/src/sse/xyza_laba_to_image.rs +++ b/src/sse/xyza_laba_to_image.rs @@ -7,20 +7,16 @@ use crate::image::ImageConfiguration; use crate::sse::cie::{sse_lab_to_xyz, sse_lch_to_xyz, sse_luv_to_xyz}; -use crate::sse::{ - _mm_color_matrix_ps, perform_sse_gamma_transfer, sse_deinterleave_rgba_ps, sse_interleave_rgba, -}; +use crate::sse::{_mm_color_matrix_ps, sse_deinterleave_rgba_ps, sse_interleave_ps_rgba}; use crate::xyz_target::XyzTarget; -use crate::TransferFunction; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; #[inline(always)] -unsafe fn sse_xyza_lab_vld( +pub unsafe fn sse_xyza_lab_vld( src: *const f32, - transfer_function: TransferFunction, c1: __m128, c2: __m128, c3: __m128, @@ -30,9 +26,8 @@ unsafe fn sse_xyza_lab_vld( c7: __m128, c8: __m128, c9: __m128, -) -> (__m128i, 
__m128i, __m128i, __m128i) { +) -> (__m128, __m128, __m128, __m128) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm_set1_ps(255f32); let pixel_0 = _mm_loadu_ps(src); let pixel_1 = _mm_loadu_ps(src.add(4)); let pixel_2 = _mm_loadu_ps(src.add(8)); @@ -68,21 +63,7 @@ unsafe fn sse_xyza_lab_vld( r_f32 = linear_r; g_f32 = linear_g; b_f32 = linear_b; - - r_f32 = perform_sse_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_sse_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_sse_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm_mul_ps(r_f32, v_scale_color); - g_f32 = _mm_mul_ps(g_f32, v_scale_color); - b_f32 = _mm_mul_ps(b_f32, v_scale_color); - let a_f32 = _mm_mul_ps(a_f32, v_scale_color); - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_cvtps_epi32(_mm_round_ps::(a_f32)), - ) + (r_f32, g_f32, b_f32, a_f32) } #[target_feature(enable = "sse4.1")] @@ -90,11 +71,10 @@ pub unsafe fn sse_xyza_to_image usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if !image_configuration.has_alpha() { @@ -117,169 +97,32 @@ pub unsafe fn sse_xyza_to_image( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = sse_xyza_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = sse_xyza_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - - let r_row23 = _mm_packs_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packs_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packs_epi32(b_row2_, b_row3_); - let a_row23 = _mm_packs_epi32(a_row2_, a_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - let a_row = _mm_packus_epi16(a_row01, a_row23); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); let (rgba0, rgba1, rgba2, rgba3) = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) - } - }; - - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); - - let src_ptr_0 = offset_src_ptr; - - let 
(r_row0_, g_row0_, b_row0_, a_row0_) = sse_xyza_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - let a_row = _mm_packus_epi16(a_row01, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - let (rgba0, rgba1, _, _) = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) + sse_interleave_ps_rgba(r_row0_, g_row0_, b_row0_, a_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) + sse_interleave_ps_rgba(b_row0_, g_row0_, r_row0_, a_row0_) } }; - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_ps(dst_ptr, rgba0); + _mm_storeu_ps(dst_ptr.add(4), rgba1); + _mm_storeu_ps(dst_ptr.add(8), rgba2); + _mm_storeu_ps(dst_ptr.add(12), rgba3); - cx += 8; + cx += 4; } cx diff --git a/src/xyz.rs b/src/xyz.rs index cbdca90..2401883 100644 --- a/src/xyz.rs +++ b/src/xyz.rs @@ -129,7 +129,7 @@ impl Xyz { /// * `matrix` - Transformation matrix from RGB to XYZ, for example `SRGB_TO_XYZ_D65` /// * `transfer_function` - Transfer functions for current colorspace #[inline] - pub fn from_linear_rgb(rgb: &Rgb, matrix: &[[f32; 3]; 3]) -> Self { + pub fn from_linear_rgb(rgb: Rgb, matrix: &[[f32; 3]; 3]) -> Self { unsafe { Self::new( (*(*matrix.get_unchecked(0)).get_unchecked(0)) * rgb.r diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index ba8c885..3cd011b 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -19,7 +19,6 @@ use crate::{LCh, Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; fn xyz_to_channels( @@ -50,11 +49,10 @@ fn xyz_to_channels usize, > = None; @@ -73,15 +71,22 @@ fn xyz_to_channels); } + let src_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + src.as_ptr() as *mut u8, + src_stride as usize * height as usize, + ) + }; + + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) 
as u8; + } + #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - src.as_ptr() as *mut u8, - src_stride as usize * height as usize, - ) - }; - if USE_ALPHA { let a_slice_safe_align = unsafe { slice::from_raw_parts_mut( @@ -96,6 +101,8 @@ fn xyz_to_channels { let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + lab.to_linear_rgb(matrix) } XyzTarget::Xyz => { let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + xyz.to_linear_rgb(matrix) } XyzTarget::Luv => { let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + luv.to_linear_rgb(matrix) } XyzTarget::Lch => { let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + lch.to_linear_rgb(matrix) } }; - let dst = dst_ptr.add(x * channels); - - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; if image_configuration.has_alpha() { let a_ptr = a_channel.as_ptr() as *const f32; let a_f = a_ptr.add(x).read_unaligned(); - let a_value = (a_f * 255f32).max(0f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = + a_f; + } + } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); + + if image_configuration.has_alpha() { + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) as u8; + dst_chunk[3] = a_cast; } } }); @@ -161,23 +179,23 @@ fn xyz_to_channels { let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + lab.to_linear_rgb(matrix) } XyzTarget::Xyz => { let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + xyz.to_linear_rgb(matrix) } XyzTarget::Luv => { let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + luv.to_linear_rgb(matrix) } XyzTarget::Lch => { let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + lch.to_linear_rgb(matrix) } }; - let dst = dst_ptr.add(x * channels); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + } - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) 
* 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); } }); } @@ -218,82 +245,165 @@ fn xyz_to_channels { - let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + let mut transient_row = vec![0f32; width as usize * channels]; + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + src.as_ptr() as *const f32, + 0, + a_channel.as_ptr() as *const f32, + 0, + transient_row.as_mut_ptr(), + 0, + width, + matrix, + ); } - XyzTarget::Xyz => { - let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + + let src_ptr = src.as_ptr() as *mut f32; + + for x in _cx..width as usize { + let src_slice = src_ptr.add(x * 3); + let l_x = src_slice.read_unaligned(); + let l_y = src_slice.add(1).read_unaligned(); + let l_z = src_slice.add(2).read_unaligned(); + let rgb = match source { + XyzTarget::Lab => { + let lab = Lab::new(l_x, l_y, l_z); + lab.to_linear_rgb(matrix) + } + XyzTarget::Xyz => { + let xyz = Xyz::new(l_x, l_y, l_z); + xyz.to_linear_rgb(matrix) + } + XyzTarget::Luv => { + let luv = Luv::new(l_x, l_y, l_z); + luv.to_linear_rgb(matrix) + } + XyzTarget::Lch => { + let lch = LCh::new(l_x, l_y, l_z); + lch.to_linear_rgb(matrix) + } + }; + + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + if image_configuration.has_alpha() { + let a_ptr = a_channel.as_ptr() as *const f32; + let a_f = a_ptr.add(x).read_unaligned(); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = + a_f; + } } - XyzTarget::Luv => { - let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); + + if image_configuration.has_alpha() { + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) 
as u8; + dst_chunk[3] = a_cast; + } } - XyzTarget::Lch => { - let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + } + } + } else { + for (dst, src) in dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact_mut(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; + + let mut transient_row = vec![0f32; width as usize * channels]; + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + src.as_ptr() as *const f32, + 0, + a_channel.as_ptr(), + 0, + transient_row.as_mut_ptr(), + 0, + width, + matrix, + ); } - }; - let dst = unsafe { dst_ptr.add(x * channels) }; + let src_ptr = src.as_ptr() as *mut f32; - unsafe { - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - } - if image_configuration.has_alpha() { - let a_ptr = - unsafe { (a_channel.as_ptr() as *const u8).add(a_offset) as *const f32 }; - let a_f = unsafe { a_ptr.add(x).read_unaligned() }; - let a_value = (a_f * 255f32).max(0f32); - unsafe { - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + for x in _cx..width as usize { + let src_slice = src_ptr.add(x * 3); + let l_x = src_slice.read_unaligned(); + let l_y = src_slice.add(1).read_unaligned(); + let l_z = src_slice.add(2).read_unaligned(); + let rgb = match source { + XyzTarget::Lab => { + let lab = Lab::new(l_x, l_y, l_z); + lab.to_linear_rgb(matrix) + } + XyzTarget::Xyz => { + let xyz = Xyz::new(l_x, l_y, l_z); + xyz.to_linear_rgb(matrix) + } + XyzTarget::Luv => { + let luv = Luv::new(l_x, l_y, l_z); + luv.to_linear_rgb(matrix) + } + XyzTarget::Lch => { + let lch = LCh::new(l_x, l_y, l_z); + lch.to_linear_rgb(matrix) + } + }; + + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) 
* 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; - a_offset += a_stride as usize; } } } diff --git a/src/xyza_laba_to_image.rs b/src/xyza_laba_to_image.rs index 9032b3e..6e7048c 100644 --- a/src/xyza_laba_to_image.rs +++ b/src/xyza_laba_to_image.rs @@ -19,7 +19,6 @@ use crate::{LCh, Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -40,16 +39,7 @@ fn xyz_with_alpha_to_channels usize, + unsafe fn(usize, *const f32, usize, *mut f32, usize, u32, &[[f32; 3]; 3]) -> usize, > = None; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -67,14 +57,22 @@ fn xyz_with_alpha_to_channels); } + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) as u8; + } + + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; dst.par_chunks_exact_mut(dst_stride as usize) .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { @@ -82,21 +80,21 @@ fn xyz_with_alpha_to_channels { let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + lab.to_linear_rgb(matrix) } XyzTarget::Xyz => { let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + xyz.to_linear_rgb(matrix) } XyzTarget::Luv => { let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + luv.to_linear_rgb(matrix) } XyzTarget::Lch => { let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + lch.to_linear_rgb(matrix) } }; let l_a = src_ptr.add(px + 3).read_unaligned(); - let a_value = (l_a * 255f32).max(0f32); - let dst = dst_ptr.add(x * channels); - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; + } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round(); + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round(); + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round(); + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) 
as u8; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast as usize); + dst_chunk[1] = *lut_table.get_unchecked(g_cast as usize); + dst_chunk[2] = *lut_table.get_unchecked(b_cast as usize); + dst_chunk[3] = a_cast; } }); } #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + for (dst, src) in dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) + { + unsafe { + let channels = image_configuration.get_channels_count(); - let channels = image_configuration.get_channels_count(); + let mut _cx = 0usize; - for _ in 0..height as usize { - let mut _cx = 0usize; + let mut transient_row = vec![0f32; width as usize * channels]; - if let Some(dispatcher) = _wide_row_handler { - unsafe { + if let Some(dispatcher) = _wide_row_handler { _cx = dispatcher( _cx, - src.as_ptr(), - src_offset, - dst.as_mut_ptr(), - dst_offset, + src.as_ptr() as *const f32, + 0, + transient_row.as_mut_ptr(), + 0, width, matrix, - transfer_function, ) } - } - let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *mut f32 }; - let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; - - for x in _cx..width as usize { - let px = x * 4; - let l_x = unsafe { src_ptr.add(px).read_unaligned() }; - let l_y = unsafe { src_ptr.add(px + 1).read_unaligned() }; - let l_z = unsafe { src_ptr.add(px + 2).read_unaligned() }; - let rgb = match source { - XyzTarget::Lab => { - let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() - } - XyzTarget::Xyz => { - let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) - } - XyzTarget::Luv => { - let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() - } - XyzTarget::Lch => { - let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() - } - }; - - let l_a = unsafe { src_ptr.add(px + 3).read_unaligned() }; - let a_value = (l_a * 255f32).max(0f32); - unsafe { - let dst = dst_ptr.add(x * channels); - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + let src_ptr = src.as_ptr() as *mut f32; + + for x in _cx..width as usize { + let px = x * 4; + let l_x = src_ptr.add(px).read_unaligned(); + let l_y = src_ptr.add(px + 1).read_unaligned(); + let l_z = src_ptr.add(px + 2).read_unaligned(); + let rgb = match source { + XyzTarget::Lab => { + let lab = Lab::new(l_x, l_y, l_z); + lab.to_linear_rgb(matrix) + } + XyzTarget::Xyz => { + let xyz = Xyz::new(l_x, l_y, l_z); + xyz.to_linear_rgb(matrix) + } + XyzTarget::Luv => { + let luv = Luv::new(l_x, l_y, l_z); + luv.to_linear_rgb(matrix) + } + XyzTarget::Lch => { + let lch = LCh::new(l_x, l_y, l_z); + lch.to_linear_rgb(matrix) + } + }; + + let l_a = src_ptr.add(px + 3).read_unaligned(); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; } - } - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = 
(src_chunks[0].min(1.).max(0.) * 2048f32).round(); + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round(); + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round(); + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) as u8; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast as usize); + dst_chunk[1] = *lut_table.get_unchecked(g_cast as usize); + dst_chunk[2] = *lut_table.get_unchecked(b_cast as usize); + dst_chunk[3] = a_cast; + } + } } } }
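
The scalar paths in xyz_lab_to_image.rs and xyza_laba_to_image.rs now share one shape: convert into a transient linear f32 row, then gamma-encode the whole row through a 2049-entry lookup table (2048 steps plus an entry for exactly 1.0) instead of evaluating the transfer function per pixel. A self-contained model of that table, with a local `srgb_gamma` standing in for the crate's `TransferFunction::gamma`:

    // Linear f32 in [0, 1] -> gamma-encoded u8 via a 2049-entry LUT.
    fn srgb_gamma(v: f32) -> f32 {
        if v <= 0.0031308 {
            v * 12.92
        } else {
            1.055 * v.powf(1.0 / 2.4) - 0.055
        }
    }

    fn build_gamma_lut() -> Vec<u8> {
        // Indices 0..=2048 cover the inclusive range [0.0, 1.0],
        // mirroring the table construction in the patch.
        (0..2049)
            .map(|i| (srgb_gamma(i as f32 * (1. / 2048.)) * 255.).ceil().min(255.) as u8)
            .collect()
    }

    fn encode(linear: f32, lut: &[u8]) -> u8 {
        // Clamp, scale onto the table grid, round to the nearest entry; the
        // final `.min(2048)` mirrors the patch's defensive bound on the index.
        let idx = (linear.min(1.).max(0.) * 2048.).round() as usize;
        lut[idx.min(2048)]
    }

    fn main() {
        let lut = build_gamma_lut();
        assert_eq!(encode(0.0, &lut), 0);
        assert_eq!(encode(1.0, &lut), 255);
        // Linear mid-gray (~0.214) encodes near 128 under sRGB gamma.
        println!("0.214 -> {}", encode(0.214, &lut));
    }

Quantizing the linear value onto a 1/2048 grid before encoding bounds the error at half a table step; for sRGB, whose slope is capped at 12.92 in the toe, that stays below one 8-bit output step, while transfer functions with steeper toes give up slightly more precision near black. Alpha bypasses the table entirely and is scaled straight to 0..=255, as in the row loops above.
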