From ff40c57d1f47d3208e65db13ed44aa846af1e24a Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 12:52:37 +0000 Subject: [PATCH] Fix discovered AVX sigmoidal fix --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/avx/oklab_to_image.rs | 3 +- src/avx/routines.rs | 86 ------------------------ src/avx/to_sigmoidal.rs | 134 +------------------------------------ src/gamma_curves.rs | 11 ++- src/image_to_lalphabeta.rs | 4 +- src/oklab.rs | 2 +- src/xyz_lab_to_image.rs | 4 +- src/xyza_laba_to_image.rs | 4 +- 10 files changed, 16 insertions(+), 236 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 847b1ab..57d154e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,7 +169,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.7.3" +version = "0.7.4" dependencies = [ "erydanos", "half", diff --git a/Cargo.toml b/Cargo.toml index 56dee65..af8b827 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] } [package] name = "colorutils-rs" -version = "0.7.3" +version = "0.7.4" edition = "2021" description = "High performance utilities for color format handling and conversion." readme = "README.md" diff --git a/src/avx/oklab_to_image.rs b/src/avx/oklab_to_image.rs index c590277..70d7f35 100644 --- a/src/avx/oklab_to_image.rs +++ b/src/avx/oklab_to_image.rs @@ -101,8 +101,7 @@ pub unsafe fn avx_oklab_to_image( - ptr: *const u8, -) -> (__m256i, __m256i, __m256i, __m256i) { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (r_chan, g_chan, b_chan, a_chan); - - let row1 = _mm256_loadu_si256(ptr as *const __m256i); - let row2 = _mm256_loadu_si256(ptr.add(32) as *const __m256i); - let empty_row = _mm256_setzero_si256(); - match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Bgr => { - let (c1, c2, c3) = avx2_deinterleave_rgb_epi8(row1, row2, empty_row); - if image_configuration == ImageConfiguration::Rgb { - r_chan = c1; - g_chan = c2; - b_chan = c3; - } else { - r_chan = c3; - g_chan = c2; - b_chan = c1; - } - a_chan = _mm256_set1_epi8(-128); - } - ImageConfiguration::Rgba | ImageConfiguration::Bgra => { - let (c1, c2, c3, c4) = avx2_deinterleave_rgba_epi8(row1, row2, empty_row, empty_row); - if image_configuration == ImageConfiguration::Rgba { - r_chan = c1; - g_chan = c2; - b_chan = c3; - a_chan = c4; - } else { - r_chan = c3; - g_chan = c2; - b_chan = c1; - a_chan = c4; - } - } - } - - (r_chan, g_chan, b_chan, a_chan) -} - -#[inline(always)] -pub(crate) unsafe fn avx_vld_u8_and_deinterleave_quarter( - ptr: *const u8, -) -> (__m256i, __m256i, __m256i, __m256i) { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (r_chan, g_chan, b_chan, a_chan); - - let row1 = _mm256_loadu_si256(ptr as *const __m256i); - let empty_row = _mm256_setzero_si256(); - match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Bgr => { - let (c1, c2, c3) = avx2_deinterleave_rgb_epi8(row1, empty_row, empty_row); - if image_configuration == ImageConfiguration::Rgb { - r_chan = c1; - g_chan = c2; - b_chan = c3; - } else { - r_chan = c3; - g_chan = c2; - b_chan = c1; - } - a_chan = _mm256_set1_epi8(-128); - } - ImageConfiguration::Rgba | ImageConfiguration::Bgra => { - let (c1, c2, c3, c4) = - avx2_deinterleave_rgba_epi8(row1, empty_row, empty_row, empty_row); - if image_configuration == ImageConfiguration::Rgba { - r_chan = c1; - g_chan = c2; - b_chan = c3; - a_chan = c4; - } else { - r_chan = c3; - g_chan = c2; - b_chan = c1; - a_chan = c4; - } - } - } - - (r_chan, g_chan, b_chan, a_chan) -} - #[inline(always)] pub(crate) unsafe fn avx_vld_f32_and_deinterleave( ptr: *const f32, diff --git a/src/avx/to_sigmoidal.rs b/src/avx/to_sigmoidal.rs index a694e76..253f431 100644 --- a/src/avx/to_sigmoidal.rs +++ b/src/avx/to_sigmoidal.rs @@ -10,10 +10,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use crate::avx::routines::{ - avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half, - avx_vld_u8_and_deinterleave_quarter, -}; +use crate::avx::routines::avx_vld_u8_and_deinterleave; use crate::avx::sigmoidal::avx_rgb_to_sigmoidal; use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; @@ -192,134 +189,5 @@ pub unsafe fn avx_image_to_sigmoidal_row< cx += 32; } - while cx + 16 < width as usize { - let src_ptr = src.add(cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * channels); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * channels); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low - ); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = - avx_rgb_to_sigmoidal(r_low_high, g_low_high, b_low_high); - - if USE_ALPHA { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * channels + 8 * channels); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * channels + 8 * channels); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_quarter::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * channels); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * channels); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low - ); - } - - cx += 8; - } - cx } diff --git a/src/gamma_curves.rs b/src/gamma_curves.rs index 85f5a2d..49e5890 100644 --- a/src/gamma_curves.rs +++ b/src/gamma_curves.rs @@ -258,15 +258,14 @@ pub fn hlg_to_linear(gamma: f32) -> f32 { if gamma < 0.0 { return 0.0; } - let linear; - if gamma <= 0.5 { - linear = f32::powf((gamma * gamma) * (1.0 / 3.0), 1.2); + let linear = if gamma <= 0.5 { + f32::powf((gamma * gamma) * (1.0 / 3.0), 1.2) } else { - linear = f32::powf( + f32::powf( (f32::exp((gamma - 0.55991073) / 0.17883277) + 0.28466892) / 12.0, 1.2, - ); - } + ) + }; // Scale so that SDR white is 1.0 (extended SDR). linear * HLG_WHITE_NITS / SDR_WHITE_NITS } diff --git a/src/image_to_lalphabeta.rs b/src/image_to_lalphabeta.rs index 2036f21..90ab16a 100644 --- a/src/image_to_lalphabeta.rs +++ b/src/image_to_lalphabeta.rs @@ -27,8 +27,8 @@ fn channels_to_lalphabeta( let channels = image_configuration.get_channels_count(); let mut lut_table = vec![0f32; 256]; - for i in 0..256 { - lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0)); + for (i, element) in lut_table.iter_mut().enumerate() { + *element = transfer_function.linearize(i as f32 * (1. / 255.0)); } let dst_slice_safe_align = unsafe { diff --git a/src/oklab.rs b/src/oklab.rs index 0428982..0e45cae 100644 --- a/src/oklab.rs +++ b/src/oklab.rs @@ -4,7 +4,7 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. */ -#[allow(clippy::excessive_precision)] +#![allow(clippy::excessive_precision)] use crate::utils::mlaf; use crate::{EuclideanDistance, Rgb, TaxicabDistance, TransferFunction}; use num_traits::Pow; diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index 557630d..b5a2fe0 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -79,8 +79,8 @@ fn xyz_to_channels