From e84a311236265b3de80c9dec3873c5b41004e19d Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 15 Jun 2024 16:50:32 +0100 Subject: [PATCH] AVX2 fixes --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/app/src/main.rs | 83 +++---------------------------- src/avx/avx2_to_xyz_lab.rs | 4 +- src/avx/avx_gamma_curves.rs | 6 +-- src/avx/avx_math.rs | 5 -- src/avx/avx_support.rs | 64 ++++++++++++++++-------- src/avx/avx_xyz_lab_to_image.rs | 6 +-- src/avx/avx_xyza_laba_to_image.rs | 20 +++++--- src/rgb_expand.rs | 2 +- 10 files changed, 71 insertions(+), 123 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2c3d28e..76b67e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -163,7 +163,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.4.0" +version = "0.4.1" dependencies = [ "half", ] diff --git a/Cargo.toml b/Cargo.toml index 9cb5043..b9ebfbc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] } [package] name = "colorutils-rs" -version = "0.4.0" +version = "0.4.1" edition = "2021" description = "High performance utilities for color format handling and conversion." readme = "README.md" diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 73ce33e..271571f 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -1,8 +1,10 @@ -use colorutils_rs::*; -use image::io::Reader as ImageReader; -use image::{EncodableLayout, GenericImageView}; use std::time::Instant; +use image::{EncodableLayout, GenericImageView}; +use image::io::Reader as ImageReader; + +use colorutils_rs::*; + #[cfg(target_arch = "x86_64")] #[inline(always)] #[allow(dead_code)] @@ -12,27 +14,6 @@ pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { } fn main() { - // #[cfg(target_arch = "x86_64")] - // unsafe { - // println!("HAS fma: {}", is_x86_feature_detected!("fma")); - // let mut dst: [f32; 4] = [0f32; 4]; - // // let src = _mm_setr_ps(0.119973198f32, 0.0428578928f32, 0.225254923f32, 27f32); - // let src = _mm_setr_ps(0.0428578928f32, 0.0428578928f32, 0.0428578928f32, 27f32); - // let rgb = Rgb::::new(0, 0, 0); - // let xyz = Xyz::from_srgb(&rgb); - // let ln = _mm_cbrt_ps(src); - // println!("X: {}, Y: {}, Z: {}", 0.119973198f32.cbrt(), 0.0428578928f32.cbrt(), 0.225254923f32.cbrt()); - // _mm_storeu_ps(dst.as_mut_ptr() as *mut f32, ln); - // println!("{:?}", dst); - // } - // #[cfg(target_arch = "aarch64")] - // unsafe { - // let m = vdupq_n_f32(27f32); - // let cbrt = vcbrtq_f32_ulp2(m); - // let l = vgetq_lane_f32::<0>(cbrt); - // println!("Cbrt {}", l); - // } - let r = 140; let g = 164; let b = 177; @@ -42,55 +23,7 @@ fn main() { println!("HSL {:?}", hsl); println!("Back RGB {:?}", hsl.to_rgb8()); - // unsafe { - // let (h, s, l) = neon_rgb_to_hsl( - // vdupq_n_u32(r as u32), - // vdupq_n_u32(g as u32), - // vdupq_n_u32(b as u32), - // vdupq_n_f32(1f32), - // ); - // println!( - // "NEON HSL {}, {}, {}", - // vgetq_lane_f32::<0>(h), - // vgetq_lane_f32::<0>(s), - // vgetq_lane_f32::<0>(l) - // ); - // let (r1, g1, b1) = neon_hsl_to_rgb(h, s, l, vdupq_n_f32(1f32)); - // - // println!( - // "NEON HSL -> RGB {}, {}, {}", - // vgetq_lane_u32::<0>(r1), - // vgetq_lane_u32::<0>(g1), - // vgetq_lane_u32::<0>(b1) - // ); - // } - // - // unsafe { - // let (h, s, v) = neon_rgb_to_hsv( - // vdupq_n_u32(r as u32), - // vdupq_n_u32(g as u32), - // vdupq_n_u32(b as u32), - // vdupq_n_f32(1f32), - // ); - // let hsv = rgb.to_hsv(); - // println!("HSV {:?}", hsv); - // println!("HSV->RBB {:?}", hsv.to_rgb8()); - // println!( - // "NEON HSV {}, {}, {}", - // vgetq_lane_f32::<0>(h), - // vgetq_lane_f32::<0>(s), - // vgetq_lane_f32::<0>(v) - // ); - // let (r1, g1, b1) = neon_hsv_to_rgb(h, s, v, vdupq_n_f32(1f32)); - // println!( - // "NEON RGB {}, {}, {}", - // vgetq_lane_u32::<0>(r1), - // vgetq_lane_u32::<0>(g1), - // vgetq_lane_u32::<0>(b1) - // ); - // } - - let img = ImageReader::open("./assets/asset.jpg") + let img = ImageReader::open("./assets/asset_middle.jpg") .unwrap() .decode() .unwrap(); @@ -125,7 +58,7 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0f32); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_sigmoidal( + rgb_to_lab( src_bytes, src_stride, &mut lab_store, @@ -159,7 +92,7 @@ fn main() { // } let start_time = Instant::now(); - sigmoidal_to_rgb( + lab_to_srgb( &lab_store, store_stride as u32, &mut dst_slice, diff --git a/src/avx/avx2_to_xyz_lab.rs b/src/avx/avx2_to_xyz_lab.rs index dc6b3f3..acce841 100644 --- a/src/avx/avx2_to_xyz_lab.rs +++ b/src/avx/avx2_to_xyz_lab.rs @@ -163,7 +163,7 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< let row3 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Bgr => { - let (c1, c2, c3) = avx2_deinterleave_rgb(row1, row2, row3); + let (c1, c2, c3) = avx2_deinterleave_rgb_epi8(row1, row2, row3); if image_configuration == ImageConfiguration::Rgb { r_chan = c1; g_chan = c2; @@ -177,7 +177,7 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< } ImageConfiguration::Rgba | ImageConfiguration::Bgra => { let row4 = _mm256_loadu_si256(src_ptr.add(64 + 32) as *const __m256i); - let (c1, c2, c3, c4) = avx2_deinterleave_rgba(row1, row2, row3, row4); + let (c1, c2, c3, c4) = avx2_deinterleave_rgba_epi8(row1, row2, row3, row4); if image_configuration == ImageConfiguration::Rgba { r_chan = c1; g_chan = c2; diff --git a/src/avx/avx_gamma_curves.rs b/src/avx/avx_gamma_curves.rs index 789980f..27e234b 100644 --- a/src/avx/avx_gamma_curves.rs +++ b/src/avx/avx_gamma_curves.rs @@ -8,7 +8,6 @@ use std::arch::x86_64::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 { let low_cut_off = _mm256_set1_ps(0.0030412825601275209f32); let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off); @@ -29,7 +28,6 @@ pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 { let low_cut_off = _mm256_set1_ps(12.92f32 * 0.0030412825601275209f32); let mask = _mm256_cmp_ps::<_CMP_GE_OS>(gamma, low_cut_off); @@ -48,7 +46,6 @@ pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 { let low_cut_off = _mm256_set1_ps(0.018053968510807f32); let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off); @@ -69,7 +66,6 @@ pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 { let low_cut_off = _mm256_set1_ps(4.5f32 * 0.018053968510807f32); let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off); @@ -92,7 +88,7 @@ pub unsafe fn get_avx_gamma_transfer( transfer_function: TransferFunction, ) -> unsafe fn(__m256) -> __m256 { match transfer_function { - TransferFunction::Srgb => avx2_srgb_to_linear, + TransferFunction::Srgb => avx2_srgb_from_linear, TransferFunction::Rec709 => avx2_rec709_from_linear, } } diff --git a/src/avx/avx_math.rs b/src/avx/avx_math.rs index 77d956d..3eae780 100644 --- a/src/avx/avx_math.rs +++ b/src/avx/avx_math.rs @@ -5,7 +5,6 @@ use std::arch::x86_64::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm256_cube_ps(x: __m256) -> __m256 { _mm256_mul_ps(_mm256_mul_ps(x, x), x) } @@ -13,7 +12,6 @@ pub unsafe fn _mm256_cube_ps(x: __m256) -> __m256 { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[cfg(not(target_feature = "fma"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 { return _mm256_add_ps(_mm256_mul_ps(b, c), a); } @@ -21,14 +19,12 @@ pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[cfg(target_feature = "fma")] #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 { return _mm256_fmadd_ps(b, c, a); } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] unsafe fn _mm256_taylorpoly_ps( x: __m256, poly0: __m256, @@ -56,7 +52,6 @@ unsafe fn _mm256_taylorpoly_ps( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 { let const_ln127 = _mm256_set1_epi32(127); // 127 let const_ln2 = _mm256_set1_ps(std::f32::consts::LN_2); // ln(2) diff --git a/src/avx/avx_support.rs b/src/avx/avx_support.rs index 98ac81d..296dd45 100644 --- a/src/avx/avx_support.rs +++ b/src/avx/avx_support.rs @@ -140,7 +140,28 @@ pub unsafe fn avx2_interleave_rgb( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] -pub unsafe fn avx2_deinterleave_rgb( +pub unsafe fn avx2_deinterleave_rgb_epi32( + bgr0: __m256i, + bgr1: __m256i, + bgr2: __m256i, +) -> (__m256i, __m256i, __m256i) { + let s02_low = _mm256_permute2x128_si256::<32>(bgr0, bgr2); + let s02_high = _mm256_permute2x128_si256::<49>(bgr0, bgr2); + + let b0 = _mm256_blend_epi32::<0x92>(_mm256_blend_epi32::<0x24>(s02_low, s02_high), bgr1); + let g0 = _mm256_blend_epi32::<0x24>(_mm256_blend_epi32::<0x92>(s02_high, s02_low), bgr1); + let r0 = _mm256_blend_epi32::<0x92>(_mm256_blend_epi32::<0x24>(bgr1, s02_low), s02_high); + + let b0 = _mm256_shuffle_epi32::<0x6c>(b0); + let g0 = _mm256_shuffle_epi32::<0xb1>(g0); + let r0 = _mm256_shuffle_epi32::<0xc6>(r0); + (b0, g0, r0) +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn avx2_deinterleave_rgb_epi8( rgb0: __m256i, rgb1: __m256i, rgb2: __m256i, @@ -220,7 +241,7 @@ pub unsafe fn avx2_deinterleave_rgb_ps( rgb1: __m256, rgb2: __m256, ) -> (__m256, __m256, __m256) { - let (r, g, b) = avx2_deinterleave_rgb( + let (r, g, b) = avx2_deinterleave_rgb_epi32( _mm256_castps_si256(rgb0), _mm256_castps_si256(rgb1), _mm256_castps_si256(rgb2), @@ -243,7 +264,7 @@ pub unsafe fn avx2_reshuffle_odd(v: __m256i) -> __m256i { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] -pub unsafe fn avx2_deinterleave_rgba( +pub unsafe fn avx2_deinterleave_rgba_epi8( rgba0: __m256i, rgba1: __m256i, rgba2: __m256i, @@ -295,28 +316,27 @@ pub unsafe fn avx2_store_u8_rgb(ptr: *mut u8, r: __m256i, g: __m256i, b: __m256i #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] pub unsafe fn avx2_interleave_rgba_epi8( - r: __m256i, - g: __m256i, - b: __m256i, a: __m256i, + b: __m256i, + c: __m256i, + d: __m256i, ) -> (__m256i, __m256i, __m256i, __m256i) { - let bg0 = _mm256_unpacklo_epi8(r, g); - let bg1 = _mm256_unpackhi_epi8(r, g); - let ra0 = _mm256_unpacklo_epi8(b, a); - let ra1 = _mm256_unpackhi_epi8(b, a); - - let rgba0_ = _mm256_unpacklo_epi16(bg0, ra0); - let rgba1_ = _mm256_unpackhi_epi16(bg0, ra0); - let rgba2_ = _mm256_unpacklo_epi16(bg1, ra1); - let rgba3_ = _mm256_unpackhi_epi16(bg1, ra1); - - let rgba0 = _mm256_permute2x128_si256::<32>(rgba0_, rgba1_); - let rgba2 = _mm256_permute2x128_si256::<49>(rgba0_, rgba1_); - let rgba1 = _mm256_permute2x128_si256::<32>(rgba2_, rgba3_); - let rgba3 = _mm256_permute2x128_si256::<49>(rgba2_, rgba3_); - (rgba0, rgba1, rgba2, rgba3) + let bg0 = _mm256_unpacklo_epi8(a, b); + let bg1 = _mm256_unpackhi_epi8(a, b); + let ra0 = _mm256_unpacklo_epi8(c, d); + let ra1 = _mm256_unpackhi_epi8(c, d); + + let bgra0_ = _mm256_unpacklo_epi16(bg0, ra0); + let bgra1_ = _mm256_unpackhi_epi16(bg0, ra0); + let bgra2_ = _mm256_unpacklo_epi16(bg1, ra1); + let bgra3_ = _mm256_unpackhi_epi16(bg1, ra1); + + let bgra0 = _mm256_permute2x128_si256::<32>(bgra0_, bgra1_); + let bgra2 = _mm256_permute2x128_si256::<49>(bgra0_, bgra1_); + let bgra1 = _mm256_permute2x128_si256::<32>(bgra2_, bgra3_); + let bgra3 = _mm256_permute2x128_si256::<49>(bgra2_, bgra3_); + (bgra0, bgra1, bgra2, bgra3) } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] diff --git a/src/avx/avx_xyz_lab_to_image.rs b/src/avx/avx_xyz_lab_to_image.rs index 30e7e61..f6da6b6 100644 --- a/src/avx/avx_xyz_lab_to_image.rs +++ b/src/avx/avx_xyz_lab_to_image.rs @@ -138,7 +138,7 @@ pub unsafe fn avx_xyz_to_channels< c9, ); - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); + let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); let (r_row1_, g_row1_, b_row1_) = avx_xyz_lab_vld::( @@ -155,7 +155,7 @@ pub unsafe fn avx_xyz_to_channels< c9, ); - let src_ptr_2 = offset_src_ptr.add(4 * 2 * CHANNELS); + let src_ptr_2 = offset_src_ptr.add(8 * 2 * CHANNELS); let (r_row2_, g_row2_, b_row2_) = avx_xyz_lab_vld::( @@ -172,7 +172,7 @@ pub unsafe fn avx_xyz_to_channels< c9, ); - let src_ptr_3 = offset_src_ptr.add(4 * 3 * CHANNELS); + let src_ptr_3 = offset_src_ptr.add(8 * 3 * CHANNELS); let (r_row3_, g_row3_, b_row3_) = avx_xyz_lab_vld::( diff --git a/src/avx/avx_xyza_laba_to_image.rs b/src/avx/avx_xyza_laba_to_image.rs index dd8ebdf..93c1ce8 100644 --- a/src/avx/avx_xyza_laba_to_image.rs +++ b/src/avx/avx_xyza_laba_to_image.rs @@ -5,10 +5,7 @@ use std::arch::x86_64::*; use crate::avx::avx_color::{avx_lab_to_xyz, avx_luv_to_xyz}; use crate::avx::avx_gamma_curves::get_avx_gamma_transfer; -use crate::avx::{ - _mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8, avx2_pack_s32, - avx2_pack_u16, -}; +use crate::avx::{_mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8, avx2_pack_s32, avx2_pack_u16}; use crate::image::ImageConfiguration; use crate::image_to_xyz_lab::XyzTarget; use crate::TransferFunction; @@ -127,7 +124,7 @@ pub unsafe fn avx_xyza_to_image( src_ptr_1, @@ -143,7 +140,7 @@ pub unsafe fn avx_xyza_to_image( src_ptr_2, @@ -159,7 +156,7 @@ pub unsafe fn avx_xyza_to_image( src_ptr_3, @@ -192,7 +189,14 @@ pub unsafe fn avx_xyza_to_image { + avx2_interleave_rgba_epi8(r_row, g_row, b_row, a_row) + } + ImageConfiguration::Bgra | ImageConfiguration::Bgr => { + avx2_interleave_rgba_epi8(b_row, g_row, r_row, a_row) + } + }; _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); diff --git a/src/rgb_expand.rs b/src/rgb_expand.rs index 2e4fc9d..12026cc 100644 --- a/src/rgb_expand.rs +++ b/src/rgb_expand.rs @@ -79,7 +79,7 @@ pub fn rgb_to_rgba( let xyz0 = _mm256_loadu_si256(xyz_chan_ptr as *const __m256i); let xyz1 = _mm256_loadu_si256(xyz_chan_ptr.add(32) as *const __m256i); let xyz2 = _mm256_loadu_si256(xyz_chan_ptr.add(64) as *const __m256i); - let (x_p, y_p, z_p) = avx2_deinterleave_rgb(xyz0, xyz1, xyz2); + let (x_p, y_p, z_p) = avx2_deinterleave_rgb_epi8(xyz0, xyz1, xyz2); let xyza_chan_ptr = dst_ptr.add(cx * 4usize);