From a30ea28614d3db1becad4079ddb9e03dc11a932b Mon Sep 17 00:00:00 2001 From: awxkee Date: Mon, 22 Jul 2024 18:31:32 +0100 Subject: [PATCH] Bugxfixes in Jzazbz --- .github/workflows/build_push.yml | 2 ++ Cargo.lock | 2 +- Cargo.toml | 2 +- src/app/src/main.rs | 14 +++++------- src/neon/image_to_jzazbz.rs | 11 +++++++--- src/neon/jzazbz_to_image.rs | 11 ++++++---- src/rgb.rs | 11 +++++++++- src/sse/image_to_jzazbz.rs | 11 +++++++--- src/sse/jzazbz_to_image.rs | 12 +++++++---- src/sse/routines.rs | 3 +-- src/sse/support.rs | 37 -------------------------------- 11 files changed, 51 insertions(+), 65 deletions(-) diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index c06d27e..743d420 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -26,7 +26,9 @@ jobs: - run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+neon" cargo build --target aarch64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target i686-unknown-linux-gnu - run: cargo build --target powerpc-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu - name: Test release pipeline run: cargo publish --dry-run \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 3f67c2f..1f2fb71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -163,7 +163,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.5.1" +version = "0.5.2" dependencies = [ "erydanos", "half", diff --git a/Cargo.toml b/Cargo.toml index 5e6b360..f06bbdc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] } [package] name = "colorutils-rs" -version = "0.5.1" +version = "0.5.2" edition = "2021" description = "High performance utilities for color format handling and conversion." readme = "README.md" diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 46ea9ec..d7d0a92 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -25,15 +25,11 @@ fn main() { let g = 127; let b = 255; let rgb = Rgb::::new(r, g, b); - // let jzazbz = Jzazbz::from_rgb(rgb, TransferFunction::Srgb); - // println!("Jzczhz {:?}", jzazbz); - // println!("Rgb {:?}", rgb); - // let restored = jzazbz.to_rgb(TransferFunction::Srgb); - // println!("Restored RGB {:?}", restored); - println!( - "Restored RGB {:?}", - Jzazbz::new(0.1f32, 0.0, -0.2f32).to_rgb(TransferFunction::Srgb) - ); + let jzazbz = Jzazbz::from_rgb(rgb, TransferFunction::Srgb); + println!("Jzczhz {:?}", jzazbz); + println!("Rgb {:?}", rgb); + let restored = jzazbz.to_rgb(TransferFunction::Srgb); + println!("Restored RGB {:?}", restored); let img = ImageReader::open("./assets/beach_horizon.jpg") .unwrap() diff --git a/src/neon/image_to_jzazbz.rs b/src/neon/image_to_jzazbz.rs index b1cc72c..d442df9 100644 --- a/src/neon/image_to_jzazbz.rs +++ b/src/neon/image_to_jzazbz.rs @@ -9,7 +9,7 @@ use crate::image_to_jzazbz::JzazbzTarget; use crate::neon::get_neon_linear_transfer; use crate::neon::math::{vcolorq_matrix_f32, vpowq_n_f32}; use crate::{load_u8_and_deinterleave, TransferFunction, SRGB_TO_XYZ_D65}; -use erydanos::{vatan2q_f32, vhypotq_fast_f32, vmlafq_f32}; +use erydanos::{vatan2q_f32, vhypotq_fast_f32, visnanq_f32, vmlafq_f32, vpowq_f32}; use std::arch::aarch64::*; macro_rules! perceptual_quantizer { @@ -18,8 +18,13 @@ macro_rules! perceptual_quantizer { let xx = vpowq_n_f32(vmulq_n_f32($color, 1e-4), 0.1593017578125); let jx = vmlafq_f32(vdupq_n_f32(18.8515625), xx, vdupq_n_f32(0.8359375)); let den_jx = vmlafq_f32(xx, vdupq_n_f32(18.6875), vdupq_n_f32(1.)); - let rs = vpowq_n_f32(vdivq_f32(jx, den_jx), 134.034375); - vbslq_f32(flush_to_zero_mask, vdupq_n_f32(0.), rs) + let rs = vpowq_f32(vdivq_f32(jx, den_jx), vdupq_n_f32(134.034375)); + let flush_nan_to_zero_mask = visnanq_f32(rs); + vbslq_f32( + vorrq_u32(flush_to_zero_mask, flush_nan_to_zero_mask), + vdupq_n_f32(0.), + rs, + ) }}; } diff --git a/src/neon/jzazbz_to_image.rs b/src/neon/jzazbz_to_image.rs index 813e805..da4bfc2 100644 --- a/src/neon/jzazbz_to_image.rs +++ b/src/neon/jzazbz_to_image.rs @@ -7,7 +7,7 @@ use std::arch::aarch64::*; -use erydanos::{vcosq_f32, vmlafq_f32, vsinq_f32}; +use erydanos::{vcosq_f32, visnanq_f32, vmlafq_f32, vpowq_f32, vsinq_f32}; use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; @@ -21,10 +21,13 @@ macro_rules! perceptual_quantizer_inverse { let xx = vpowq_n_f32($color, 7.460772656268214e-03); let num = vsubq_f32(vdupq_n_f32(0.8359375), xx); let den = vmlafq_f32(xx, vdupq_n_f32(18.6875), vdupq_n_f32(-18.8515625)); - let den_is_zero = vceqzq_f32(den); - let rs = vmulq_n_f32(vpowq_n_f32(vdivq_f32(num, den), 6.277394636015326), 1e4); + let rs = vmulq_n_f32( + vpowq_f32(vdivq_f32(num, den), vdupq_n_f32(6.277394636015326)), + 1e4, + ); + let flush_nan_mask = visnanq_f32(rs); vbslq_f32( - vorrq_u32(flush_to_zero_mask, den_is_zero), + vorrq_u32(flush_to_zero_mask, flush_nan_mask), vdupq_n_f32(0.), rs, ) diff --git a/src/rgb.rs b/src/rgb.rs index 4542c61..f1bcd2c 100644 --- a/src/rgb.rs +++ b/src/rgb.rs @@ -8,7 +8,7 @@ use crate::euclidean::EuclideanDistance; use crate::hsv::Hsv; use crate::lab::Lab; use crate::luv::Luv; -use crate::{Hsl, Jzazbz, LCh, Sigmoidal, TransferFunction, Xyz}; +use crate::{Hsl, Jzazbz, LCh, Oklab, Sigmoidal, TransferFunction, Xyz}; use erydanos::Euclidean3DDistance; #[derive(Debug, PartialOrd, PartialEq, Clone, Copy)] @@ -98,6 +98,15 @@ impl Rgb { ) } + /// Converts rgb to *Oklab* + /// + /// # Arguments + /// `transfer_function` - Transfer function to convert into linear colorspace and backwards + #[inline] + pub fn to_oklab(&self, transfer_function: TransferFunction) -> Oklab { + Oklab::from_rgb(*self, transfer_function) + } + /// Converts rgb to S-shaped sigmoidized components #[inline] pub fn to_sigmoidal(&self) -> Sigmoidal { diff --git a/src/sse/image_to_jzazbz.rs b/src/sse/image_to_jzazbz.rs index 5740d66..40af61d 100644 --- a/src/sse/image_to_jzazbz.rs +++ b/src/sse/image_to_jzazbz.rs @@ -10,7 +10,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use erydanos::{_mm_atan2_ps, _mm_hypot_fast_ps, _mm_mlaf_ps}; +use erydanos::{_mm_atan2_ps, _mm_hypot_fast_ps, _mm_isnan_ps, _mm_mlaf_ps, _mm_pow_ps}; use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; @@ -30,8 +30,13 @@ macro_rules! perceptual_quantizer { let xx = _mm_pow_n_ps(_mm_mul_ps($color, _mm_set1_ps(1e-4)), 0.1593017578125); let jx = _mm_mlaf_ps(_mm_set1_ps(18.8515625), xx, _mm_set1_ps(0.8359375)); let den_jx = _mm_mlaf_ps(xx, _mm_set1_ps(18.6875), _mm_set1_ps(1.)); - let rs = _mm_pow_n_ps(_mm_div_ps(jx, den_jx), 134.034375); - _mm_select_ps(flush_to_zero_mask, zeros, rs) + let rs = _mm_pow_ps(_mm_div_ps(jx, den_jx), _mm_set1_ps(134.034375)); + let flush_nan_to_zero_mask = _mm_isnan_ps(rs); + _mm_select_ps( + _mm_or_ps(flush_to_zero_mask, flush_nan_to_zero_mask), + zeros, + rs, + ) }}; } diff --git a/src/sse/jzazbz_to_image.rs b/src/sse/jzazbz_to_image.rs index 21376b8..725be59 100644 --- a/src/sse/jzazbz_to_image.rs +++ b/src/sse/jzazbz_to_image.rs @@ -10,7 +10,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use erydanos::{_mm_cos_ps, _mm_mlaf_ps, _mm_sin_ps}; +use erydanos::{_mm_cos_ps, _mm_isnan_ps, _mm_mlaf_ps, _mm_pow_ps, _mm_sin_ps}; use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; @@ -30,12 +30,16 @@ macro_rules! perceptual_quantizer_inverse { let xx = _mm_pow_n_ps($color, 7.460772656268214e-03); let num = _mm_sub_ps(_mm_set1_ps(0.8359375), xx); let den = _mm_mlaf_ps(xx, _mm_set1_ps(18.6875), _mm_set1_ps(-18.8515625)); - let den_is_zero = _mm_cmpeq_ps(den, zeros); let rs = _mm_mul_ps( - _mm_pow_n_ps(_mm_div_ps(num, den), 6.277394636015326), + _mm_pow_ps(_mm_div_ps(num, den), _mm_set1_ps(6.277394636015326)), _mm_set1_ps(1e4), ); - _mm_select_ps(_mm_or_ps(flush_to_zero_mask, den_is_zero), zeros, rs) + let flush_nan_to_zero_mask = _mm_isnan_ps(rs); + _mm_select_ps( + _mm_or_ps(flush_to_zero_mask, flush_nan_to_zero_mask), + zeros, + rs, + ) }}; } diff --git a/src/sse/routines.rs b/src/sse/routines.rs index 4104cef..e08e170 100644 --- a/src/sse/routines.rs +++ b/src/sse/routines.rs @@ -4,7 +4,6 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. */ -use crate::sse::{sse_interleave_rgb_epi16, sse_interleave_rgba_epi16}; #[macro_export] macro_rules! load_u8_and_deinterleave { @@ -152,4 +151,4 @@ macro_rules! store_and_interleave_v3_u16 { _mm_storeu_si128($ptr.add(8) as *mut __m128i, rgba1); _mm_storeu_si128($ptr.add(16) as *mut __m128i, rgba2); }}; -} \ No newline at end of file +} diff --git a/src/sse/support.rs b/src/sse/support.rs index c28b2fd..b75f96e 100644 --- a/src/sse/support.rs +++ b/src/sse/support.rs @@ -34,26 +34,6 @@ pub unsafe fn sse_interleave_rgba( (rgba_0_lo, rgba_0_hi, rgba_1_lo, rgba_1_hi) } -#[inline(always)] -pub unsafe fn sse_transpose_x4( - r: __m128, - g: __m128, - b: __m128, - a: __m128, -) -> (__m128, __m128, __m128, __m128) { - let t0 = _mm_castps_si128(_mm_unpacklo_ps(r, g)); - let t1 = _mm_castps_si128(_mm_unpacklo_ps(b, a)); - let t2 = _mm_castps_si128(_mm_unpackhi_ps(r, g)); - let t3 = _mm_castps_si128(_mm_unpackhi_ps(b, a)); - - let row1 = _mm_castsi128_ps(_mm_unpacklo_epi64(t0, t1)); - let row2 = _mm_castsi128_ps(_mm_unpackhi_epi64(t0, t1)); - let row3 = _mm_castsi128_ps(_mm_unpacklo_epi64(t2, t3)); - let row4 = _mm_castsi128_ps(_mm_unpackhi_epi64(t2, t3)); - - (row1, row2, row3, row4) -} - #[inline(always)] pub unsafe fn sse_interleave_ps_rgb(a: __m128, b: __m128, c: __m128) -> (__m128, __m128, __m128) { const MASK_U0: i32 = shuffle(0, 0, 0, 0); @@ -95,15 +75,6 @@ pub unsafe fn sse_interleave_ps_rgba( (v0, v1, v2, v3) } -#[inline(always)] -pub unsafe fn sse_store_rgba(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i, a: __m128i) { - let (row1, row2, row3, row4) = sse_interleave_rgba(r, g, b, a); - _mm_storeu_si128(ptr as *mut __m128i, row1); - _mm_storeu_si128(ptr.add(16) as *mut __m128i, row2); - _mm_storeu_si128(ptr.add(32) as *mut __m128i, row3); - _mm_storeu_si128(ptr.add(48) as *mut __m128i, row4); -} - #[inline(always)] pub unsafe fn sse_deinterleave_rgba( rgba0: __m128i, @@ -308,14 +279,6 @@ pub unsafe fn sse_deinterleave_rgb_epi16( (a0, b0, c0) } -#[inline(always)] -pub unsafe fn sse_store_rgb_u8(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i) { - let (v0, v1, v2) = sse_interleave_rgb(r, g, b); - _mm_storeu_si128(ptr as *mut __m128i, v0); - _mm_storeu_si128(ptr.add(16) as *mut __m128i, v1); - _mm_storeu_si128(ptr.add(32) as *mut __m128i, v2); -} - #[inline(always)] pub unsafe fn sse_deinterleave_rgba_ps( t0: __m128,