diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 61a5d61..b707b7a 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -1,9 +1,6 @@ -use std::arch::aarch64::{vdupq_n_f32, vdupq_n_u32, vgetq_lane_f32, vgetq_lane_u32}; use colorutils_rs::*; use image::io::Reader as ImageReader; use image::{EncodableLayout, GenericImageView}; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; use std::time::Instant; #[cfg(target_arch = "x86_64")] @@ -36,31 +33,64 @@ fn main() { // println!("Cbrt {}", l); // } - let rgb = Rgb::::new(140, 164, 177); + let r = 140; + let g = 164; + let b = 177; + let rgb = Rgb::::new(r, g, b); let hsl = rgb.to_hsl(); println!("RGB {:?}", rgb); println!("HSL {:?}", hsl); println!("Back RGB {:?}", hsl.to_rgb8()); - // unsafe { - // let (h, s, l) = neon_rgb_to_hsl(vdupq_n_u32(255), vdupq_n_u32(156), vdupq_n_u32(255), vdupq_n_f32(1f32)); - // println!("NEON HSL {}, {}, {}", vgetq_lane_f32::<0>(h), vgetq_lane_f32::<0>(s), vgetq_lane_f32::<0>(l)); - // let (r1, g1, b1) = neon_hsl_to_rgb(h, s, l, vdupq_n_f32(1f32)); + // unsafe { + // let (h, s, l) = sse_rgb_to_hsl( + // _mm_set1_epi32(r as i32), + // _mm_set1_epi32(g as i32), + // _mm_set1_epi32(b as i32), + // _mm_set1_ps(1f32), + // ); + // println!( + // "NEON HSL {}, {}, {}", + // f32::from_bits(_mm_extract_ps::<0>(h) as u32), + // f32::from_bits(_mm_extract_ps::<0>(s) as u32), + // f32::from_bits(_mm_extract_ps::<0>(l) as u32) + // ); + // let (r1, g1, b1) = sse_hsl_to_rgb(h, s, l, _mm_set1_ps(1f32)); // - // println!("NEON HSL -> RHB {}, {}, {}", vgetq_lane_u32::<0>(r1), vgetq_lane_u32::<0>(g1), vgetq_lane_u32::<0>(b1)); + // println!( + // "NEON HSL -> RGB {}, {}, {}", + // _mm_extract_epi32::<0>(r1), + // _mm_extract_epi32::<0>(g1), + // _mm_extract_epi32::<0>(b1) + // ); // } // - // unsafe { - // let (h, s, v) = neon_rgb_to_hsv(vdupq_n_u32(255), vdupq_n_u32(156), vdupq_n_u32(255), vdupq_n_f32(1f32)); + // unsafe { + // let (h, s, v) = sse_rgb_to_hsv( + // _mm_set1_epi32(r as i32), + // _mm_set1_epi32(g as i32), + // _mm_set1_epi32(b as i32), + // _mm_set1_ps(1f32), + // ); // let hsv = rgb.to_hsv(); // println!("HSV {:?}", hsv); - // println!("NEON HSV {}, {}, {}", vgetq_lane_f32::<0>(h), vgetq_lane_f32::<0>(s), vgetq_lane_f32::<0>(v)); - // let (r1, g1, b1) = neon_hsv_to_rgb(h, s,v, vdupq_n_f32(1f32)); - // println!("NEON RGB {}, {}, {}", vgetq_lane_u32::<0>(r1), vgetq_lane_u32::<0>(g1), vgetq_lane_u32::<0>(b1)); - + // println!("HSV->RBB {:?}", hsv.to_rgb8()); + // println!( + // "NEON HSV {}, {}, {}", + // f32::from_bits(_mm_extract_ps::<0>(h) as u32), + // f32::from_bits(_mm_extract_ps::<0>(s) as u32), + // f32::from_bits(_mm_extract_ps::<0>(v) as u32) + // ); + // let (r1, g1, b1) = sse_hsv_to_rgb(h, s, v, _mm_set1_ps(1f32)); + // println!( + // "NEON RGB {}, {}, {}", + // _mm_extract_epi32::<0>(r1), + // _mm_extract_epi32::<0>(g1), + // _mm_extract_epi32::<0>(b1) + // ); // } - let img = ImageReader::open("./assets/asset_middle.jpg") + let img = ImageReader::open("./assets/asset.jpg") .unwrap() .decode() .unwrap(); @@ -71,37 +101,42 @@ fn main() { let mut src_bytes = img.as_bytes(); let width = dimensions.0; let height = dimensions.1; - let components = 4; - - let mut dst_rgba = vec![]; - dst_rgba.resize(4usize * width as usize * height as usize, 0u8); - rgb_to_rgba( - &src_bytes, - 3u32 * width, - &mut dst_rgba, - 4u32 * width, - width, - height, - 255, - ); - src_bytes = &dst_rgba; + let components = 3; + + // let mut dst_rgba = vec![]; + // dst_rgba.resize(4usize * width as usize * height as usize, 0u8); + // rgb_to_rgba( + // &src_bytes, + // 3u32 * width, + // &mut dst_rgba, + // 4u32 * width, + // width, + // height, + // 255, + // ); + // src_bytes = &dst_rgba; let mut dst_slice: Vec = Vec::new(); - dst_slice.resize(width as usize * 4 * height as usize, 0u8); + dst_slice.resize(width as usize * components * height as usize, 0u8); { let mut lab_store: Vec = vec![]; - let store_stride = width as usize * 4usize * std::mem::size_of::(); - lab_store.resize(width as usize * 4usize * height as usize, 0u16); + let store_stride = width as usize * components * std::mem::size_of::(); + lab_store.resize(width as usize * components * height as usize, 0u16); + let src_stride = width * components as u32; let start_time = Instant::now(); - rgba_to_hsl( + rgb_to_hsl( src_bytes, - 4u32 * width, + src_stride, &mut lab_store, store_stride as u32, width, - height,100f32 + height, + 100f32, ); + let elapsed_time = start_time.elapsed(); + // Print the elapsed time in milliseconds + println!("RGBA To HSV: {:.2?}", elapsed_time); // let mut destination: Vec = vec![]; // destination.resize(width as usize * height as usize * 4, 0f32); // let dst_stride = width * 4 * std::mem::size_of::() as u32; @@ -124,18 +159,20 @@ fn main() { // src_shift += src_stride as usize; // } - hsl_to_rgba( + let start_time = Instant::now(); + hsl_to_rgb( &lab_store, store_stride as u32, &mut dst_slice, - 4u32 * width, + src_stride, width, - height,100f32, + height, + 100f32, ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds - println!("Fast image resize: {:.2?}", elapsed_time); + println!("HSV To RGBA: {:.2?}", elapsed_time); // laba_to_srgb( // &lab_store, diff --git a/src/avx/avx2_to_xyz_lab.rs b/src/avx/avx2_to_xyz_lab.rs index 28b2c56..793f66d 100644 --- a/src/avx/avx2_to_xyz_lab.rs +++ b/src/avx/avx2_to_xyz_lab.rs @@ -4,8 +4,6 @@ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; #[allow(unused_imports)] use crate::image_to_xyz_lab::XyzTarget; -#[allow(unused_imports)] -use crate::neon_gamma_curves::*; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] diff --git a/src/hsv_to_image.rs b/src/hsv_to_image.rs index d7e1e73..2e95d4b 100644 --- a/src/hsv_to_image.rs +++ b/src/hsv_to_image.rs @@ -1,9 +1,15 @@ use std::slice; -use crate::{Hsl, Hsv}; use crate::image::ImageConfiguration; use crate::image_to_hsv_support::HsvTarget; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] use crate::neon::neon_hsv_u16_to_image; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use crate::sse::sse_hsv_u16_to_image; +use crate::{Hsl, Hsv}; #[inline(always)] fn hsv_u16_to_channels< @@ -27,6 +33,17 @@ fn hsv_u16_to_channels< } } + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + let mut _has_sse = false; + + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + if is_x86_feature_detected!("sse4.1") { + _has_sse = true; + } + let mut src_offset = 0usize; let mut dst_offset = 0usize; @@ -36,7 +53,22 @@ fn hsv_u16_to_channels< for _ in 0..height as usize { #[allow(unused_mut)] - let mut cx = 0usize; + let mut _cx = 0usize; + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + unsafe { + if _has_sse { + _cx = sse_hsv_u16_to_image::( + _cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + scale, + ) + } + } #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -60,7 +92,7 @@ fn hsv_u16_to_channels< let src_slice = unsafe { slice::from_raw_parts(src_ptr, width as usize * channels) }; let dst_slice = unsafe { slice::from_raw_parts_mut(dst_ptr, width as usize * channels) }; - for x in cx..width as usize { + for x in _cx..width as usize { let px = x * channels; let h = unsafe { *src_slice.get_unchecked(px) }; let s = unsafe { *src_slice.get_unchecked(px + 1) }; diff --git a/src/image_to_hsv.rs b/src/image_to_hsv.rs index 2c4d3bc..40962f1 100644 --- a/src/image_to_hsv.rs +++ b/src/image_to_hsv.rs @@ -8,8 +8,10 @@ use crate::image_to_hsv_support::HsvTarget; ))] use crate::neon::neon_channels_to_hsv_u16; use crate::Rgb; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use crate::sse::sse_channels_to_hsv_u16; -#[inline(always)] +#[inline] fn channels_to_hsv_u16< const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, @@ -31,6 +33,17 @@ fn channels_to_hsv_u16< } } + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + let mut _has_sse = false; + + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + if is_x86_feature_detected!("sse4.1") { + _has_sse = true; + } + let mut src_offset = 0usize; let mut dst_offset = 0usize; @@ -40,6 +53,21 @@ fn channels_to_hsv_u16< #[allow(unused_mut)] let mut cx = 0usize; + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + unsafe { + if _has_sse { + cx = sse_channels_to_hsv_u16::( + cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + scale, + ) + } + } + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" diff --git a/src/lib.rs b/src/lib.rs index 3be5f67..3ef571c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,8 +79,4 @@ pub use hsv_to_image::*; pub use image_to_linear_u8::*; pub use linear_to_image_u8::*; -pub use rgb_expand::*; -pub use neon::neon_rgb_to_hsv; -pub use neon::neon_rgb_to_hsl; -pub use neon::neon_hsv_to_rgb; -pub use neon::neon_hsl_to_rgb; \ No newline at end of file +pub use rgb_expand::*; \ No newline at end of file diff --git a/src/luv.rs b/src/luv.rs index e72aa88..6713263 100644 --- a/src/luv.rs +++ b/src/luv.rs @@ -49,7 +49,6 @@ const D65_XYZ: [f32; 3] = [95.047f32, 100.0f32, 108.883f32]; use crate::rgb::Rgb; use crate::rgba::Rgba; use crate::xyz::Xyz; -use clap::Parser; pub(crate) const LUV_WHITE_U_PRIME: f32 = 4.0f32 * D65_XYZ[1] / (D65_XYZ[0] + 15.0 * D65_XYZ[1] + 3.0 * D65_XYZ[2]); diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 6c33638..007729d 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -1,9 +1,17 @@ +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] mod neon_colors; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] mod neon_gamma_curves; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] mod neon_image_to_hsv; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), diff --git a/src/neon/neon_colors.rs b/src/neon/neon_colors.rs index 4447b09..5c9e423 100644 --- a/src/neon/neon_colors.rs +++ b/src/neon/neon_colors.rs @@ -147,10 +147,6 @@ pub unsafe fn neon_hsv_to_rgb( (vcvtaq_u32_f32(r), vcvtaq_u32_f32(g), vcvtaq_u32_f32(b)) } -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline(always)] pub unsafe fn neon_rgb_to_hsv( r: uint32x4_t, @@ -209,10 +205,6 @@ pub unsafe fn neon_rgb_to_hsv( (h, vmulq_f32(s, scale), vmulq_f32(v, scale)) } -#[cfg(all( - any(target_arch = "aarch64", target_arch = "arm"), - target_feature = "neon" -))] #[inline(always)] pub unsafe fn neon_rgb_to_hsl( r: uint32x4_t, diff --git a/src/neon/neon_hsv_to_image.rs b/src/neon/neon_hsv_to_image.rs index 0862881..cc1e0fb 100644 --- a/src/neon/neon_hsv_to_image.rs +++ b/src/neon/neon_hsv_to_image.rs @@ -8,7 +8,7 @@ use crate::image_to_hsv_support::HsvTarget; any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] -#[inline(always)] +#[inline] pub unsafe fn neon_hsv_u16_to_image< const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 098a10d..0e4cc9b 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -29,6 +29,10 @@ mod sse_xyza_laba_to_image; mod sse_color; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] mod sse_xyz_lab_to_image; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod sse_image_to_hsv; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod sse_hsv_to_image; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub use sse_image_to_linear_u8::*; @@ -49,4 +53,8 @@ pub use sse_xyza_laba_to_image::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub use sse_xyz_lab_to_image::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub use sse_linear_to_image::*; \ No newline at end of file +pub use sse_linear_to_image::*; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub use sse_image_to_hsv::*; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub use sse_hsv_to_image::*; \ No newline at end of file diff --git a/src/sse/sse_color.rs b/src/sse/sse_color.rs index 4c24f8a..77db068 100644 --- a/src/sse/sse_color.rs +++ b/src/sse/sse_color.rs @@ -4,7 +4,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; use crate::luv::{LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME}; -use crate::sse::{_mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps}; +use crate::sse::{_mm_abs_ps, _mm_cube_ps, _mm_fmod_ps, _mm_prefer_fma_ps, _mm_select_ps}; #[inline(always)] #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -72,3 +72,304 @@ pub(crate) unsafe fn sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128, z = _mm_select_ps(zero_mask_2, zeros, z); (x, y, z) } + +#[inline(always)] +pub unsafe fn sse_hsl_to_rgb( + h: __m128, + s: __m128, + l: __m128, + scale: __m128, +) -> (__m128i, __m128i, __m128i) { + let s = _mm_mul_ps(s, scale); + let l = _mm_mul_ps(l, scale); + let ones = _mm_set1_ps(1f32); + let twos = _mm_set1_ps(2f32); + let c = _mm_mul_ps( + _mm_sub_ps(ones, _mm_abs_ps(_mm_sub_ps(_mm_mul_ps(l, twos), ones))), + s, + ); + let x = _mm_mul_ps( + _mm_sub_ps( + ones, + _mm_abs_ps(_mm_sub_ps( + _mm_fmod_ps(_mm_mul_ps(h, _mm_set1_ps(1f32 / 60f32)), twos), + ones, + )), + ), + c, + ); + + let zeros = _mm_setzero_ps(); + let m = _mm_sub_ps(l, _mm_mul_ps(c, _mm_set1_ps(0.5f32))); + let h_prime = h; + let (mut r, mut g, mut b) = (zeros, zeros, zeros); + + let between_zero_and_one_mask = _mm_and_ps( + _mm_cmpge_ps(h, zeros), + _mm_cmplt_ps(h_prime, _mm_set1_ps(60f32)), + ); + let between_one_and_two_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, _mm_set1_ps(60f32)), + _mm_cmplt_ps(h_prime, _mm_set1_ps(120f32)), + ); + let between_two_and_three_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, _mm_set1_ps(120f32)), + _mm_cmplt_ps(h_prime, _mm_set1_ps(180f32)), + ); + let between_three_and_four_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, _mm_set1_ps(180f32)), + _mm_cmplt_ps(h_prime, _mm_set1_ps(240f32)), + ); + let between_four_and_five_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, _mm_set1_ps(240f32)), + _mm_cmplt_ps(h_prime, _mm_set1_ps(300f32)), + ); + let between_five_and_six_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, _mm_set1_ps(300f32)), + _mm_cmplt_ps(h_prime, _mm_set1_ps(360f32)), + ); + // if h_prime >= 0f32 && h_prime < 1f32 { + r = _mm_select_ps(between_zero_and_one_mask, c, r); + g = _mm_select_ps(between_zero_and_one_mask, x, g); + // if h_prime >= 1f32 && h_prime < 2f32 { + r = _mm_select_ps(between_one_and_two_mask, x, r); + g = _mm_select_ps(between_one_and_two_mask, c, g); + // if h_prime >= 2f32 && h_prime < 3f32 + g = _mm_select_ps(between_two_and_three_mask, c, g); + b = _mm_select_ps(between_two_and_three_mask, x, b); + // if h_prime >= 3f32 && h_prime < 4f32 { + g = _mm_select_ps(between_three_and_four_mask, x, g); + b = _mm_select_ps(between_three_and_four_mask, c, b); + // if h_prime >= 4f32 && h_prime < 5f32 { + r = _mm_select_ps(between_four_and_five_mask, x, r); + b = _mm_select_ps(between_four_and_five_mask, c, b); + // if h_prime >= 5f32 && h_prime < 6f32 { + r = _mm_select_ps(between_five_and_six_mask, c, r); + b = _mm_select_ps(between_five_and_six_mask, x, b); + r = _mm_add_ps(r, m); + g = _mm_add_ps(g, m); + b = _mm_add_ps(b, m); + let rgb_scale = _mm_set1_ps(255f32); + r = _mm_mul_ps(r, rgb_scale); + g = _mm_mul_ps(g, rgb_scale); + b = _mm_mul_ps(b, rgb_scale); + const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + ( + _mm_cvtps_epi32(_mm_round_ps::(r)), + _mm_cvtps_epi32(_mm_round_ps::(g)), + _mm_cvtps_epi32(_mm_round_ps::(b)), + ) +} + +#[inline(always)] +pub unsafe fn sse_hsv_to_rgb( + h: __m128, + s: __m128, + v: __m128, + scale: __m128, +) -> (__m128i, __m128i, __m128i) { + let s = _mm_mul_ps(s, scale); + let v = _mm_mul_ps(v, scale); + let c = _mm_mul_ps(s, v); + let h_der = _mm_mul_ps(h, _mm_set1_ps(1f32 / 60f32)); + let six = _mm_set1_ps(6f32); + let h_prime = _mm_fmod_ps(h_der, six); + let ones = _mm_set1_ps(1f32); + let x = _mm_mul_ps( + _mm_sub_ps( + ones, + _mm_abs_ps(_mm_sub_ps(_mm_fmod_ps(h_prime, _mm_set1_ps(2f32)), ones)), + ), + c, + ); + let zeros = _mm_setzero_ps(); + let m = _mm_sub_ps(v, c); + let (mut r, mut g, mut b) = (zeros, zeros, zeros); + let between_zero_and_one_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, zeros), + _mm_cmplt_ps(h_prime, ones), + ); + let twos = _mm_set1_ps(2f32); + let between_one_and_two_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, ones), + _mm_cmplt_ps(h_prime, twos), + ); + let threes = _mm_set1_ps(3f32); + let between_two_and_three_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, twos), + _mm_cmplt_ps(h_prime, threes), + ); + let fours = _mm_set1_ps(4f32); + let between_three_and_four_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, threes), + _mm_cmplt_ps(h_prime, fours), + ); + let fives = _mm_set1_ps(5f32); + let between_four_and_five_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, fours), + _mm_cmplt_ps(h_prime, fives), + ); + let between_five_and_six_mask = _mm_and_ps( + _mm_cmpge_ps(h_prime, fives), + _mm_cmplt_ps(h_prime, six), + ); + // if h_prime >= 0f32 && h_prime < 1f32 { + r = _mm_select_ps(between_zero_and_one_mask, c, r); + g = _mm_select_ps(between_zero_and_one_mask, x, g); + // if h_prime >= 1f32 && h_prime < 2f32 { + r = _mm_select_ps(between_one_and_two_mask, x, r); + g = _mm_select_ps(between_one_and_two_mask, c, g); + // if h_prime >= 2f32 && h_prime < 3f32 + g = _mm_select_ps(between_two_and_three_mask, c, g); + b = _mm_select_ps(between_two_and_three_mask, x, b); + // if h_prime >= 3f32 && h_prime < 4f32 { + g = _mm_select_ps(between_three_and_four_mask, x, g); + b = _mm_select_ps(between_three_and_four_mask, c, b); + // if h_prime >= 4f32 && h_prime < 5f32 { + r = _mm_select_ps(between_four_and_five_mask, x, r); + b = _mm_select_ps(between_four_and_five_mask, c, b); + // if h_prime >= 5f32 && h_prime < 6f32 { + r = _mm_select_ps(between_five_and_six_mask, c, r); + b = _mm_select_ps(between_five_and_six_mask, x, b); + r = _mm_add_ps(r, m); + g = _mm_add_ps(g, m); + b = _mm_add_ps(b, m); + let rgb_scale = _mm_set1_ps(255f32); + r = _mm_mul_ps(r, rgb_scale); + g = _mm_mul_ps(g, rgb_scale); + b = _mm_mul_ps(b, rgb_scale); + const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + ( + _mm_cvtps_epi32(_mm_round_ps::(r)), + _mm_cvtps_epi32(_mm_round_ps::(g)), + _mm_cvtps_epi32(_mm_round_ps::(b)), + ) +} + +#[inline(always)] +pub unsafe fn sse_rgb_to_hsv( + r: __m128i, + g: __m128i, + b: __m128i, + scale: __m128, +) -> (__m128, __m128, __m128) { + let rgb_scale = _mm_set1_ps(1f32 / 255f32); + let r = _mm_mul_ps(_mm_cvtepi32_ps(r), rgb_scale); + let g = _mm_mul_ps(_mm_cvtepi32_ps(g), rgb_scale); + let b = _mm_mul_ps(_mm_cvtepi32_ps(b), rgb_scale); + let c_max = _mm_max_ps(_mm_max_ps(r, g), b); + let c_min = _mm_min_ps(_mm_min_ps(r, g), b); + let delta = _mm_sub_ps(c_max, c_min); + let rcp_delta = _mm_rcp_ps(delta); + let is_r_max = _mm_cmpeq_ps(c_max, r); + let is_g_max = _mm_cmpeq_ps(c_max, g); + let is_b_max = _mm_cmpeq_ps(c_max, b); + let immediate_zero_flag = _mm_cmpeq_ps(delta, _mm_setzero_ps()); + let mut h = _mm_setzero_ps(); + let v_six = _mm_set1_ps(60f32); + h = _mm_select_ps( + is_r_max, + _mm_mul_ps( + _mm_fmod_ps(_mm_mul_ps(_mm_sub_ps(g, b), rcp_delta), _mm_set1_ps(6f32)), + v_six, + ), + h, + ); + let adding_2 = _mm_set1_ps(2f32); + h = _mm_select_ps( + is_g_max, + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b, r), rcp_delta), adding_2), + v_six, + ), + h, + ); + let adding_4 = _mm_set1_ps(4f32); + h = _mm_select_ps( + is_b_max, + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(r, g), rcp_delta), adding_4), + v_six, + ), + h, + ); + let zeros = _mm_setzero_ps(); + h = _mm_select_ps(immediate_zero_flag, zeros, h); + let s = _mm_select_ps( + _mm_cmpeq_ps(c_max, zeros), + zeros, + _mm_mul_ps(delta, _mm_rcp_ps(c_max)), + ); + h = _mm_select_ps( + _mm_cmplt_ps(h, zeros), + _mm_add_ps(h, _mm_set1_ps(360f32)), + h, + ); + let v = c_max; + (h, _mm_mul_ps(s, scale), _mm_mul_ps(v, scale)) +} + +#[inline(always)] +pub unsafe fn sse_rgb_to_hsl( + r: __m128i, + g: __m128i, + b: __m128i, + scale: __m128, +) -> (__m128, __m128, __m128) { + let rgb_scale = _mm_set1_ps(1f32 / 255f32); + let r = _mm_mul_ps(_mm_cvtepi32_ps(r), rgb_scale); + let g = _mm_mul_ps(_mm_cvtepi32_ps(g), rgb_scale); + let b = _mm_mul_ps(_mm_cvtepi32_ps(b), rgb_scale); + let c_max = _mm_max_ps(_mm_max_ps(r, g), b); + let c_min = _mm_min_ps(_mm_min_ps(r, g), b); + let delta = _mm_sub_ps(c_max, c_min); + let rcp_delta = _mm_rcp_ps(delta); + let is_r_max = _mm_cmpeq_ps(c_max, r); + let is_g_max = _mm_cmpeq_ps(c_max, g); + let is_b_max = _mm_cmpeq_ps(c_max, b); + let zeros = _mm_setzero_ps(); + let immediate_zero_flag = _mm_cmpeq_ps(delta, zeros); + let v_six = _mm_set1_ps(60f32); + let mut h = _mm_setzero_ps(); + h = _mm_select_ps( + is_r_max, + _mm_mul_ps( + _mm_fmod_ps(_mm_mul_ps(_mm_sub_ps(g, b), rcp_delta), _mm_set1_ps(6f32)), + v_six, + ), + h, + ); + let adding_2 = _mm_set1_ps(2f32); + h = _mm_select_ps( + is_g_max, + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b, r), rcp_delta), adding_2), + v_six, + ), + h, + ); + let adding_4 = _mm_set1_ps(4f32); + h = _mm_select_ps( + is_b_max, + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(r, g), rcp_delta), adding_4), + v_six, + ), + h, + ); + h = _mm_select_ps(immediate_zero_flag, zeros, h); + h = _mm_select_ps( + _mm_cmplt_ps(h, zeros), + _mm_add_ps(h, _mm_set1_ps(360f32)), + h, + ); + let l = _mm_mul_ps(_mm_add_ps(c_max, c_min), _mm_set1_ps(0.5f32)); + let s = _mm_div_ps( + delta, + _mm_sub_ps( + _mm_set1_ps(1f32), + _mm_abs_ps(_mm_prefer_fma_ps(_mm_set1_ps(-1f32), _mm_set1_ps(2f32), l)), + ), + ); + (h, _mm_mul_ps(s, scale), _mm_mul_ps(l, scale)) +} diff --git a/src/sse/sse_hsv_to_image.rs b/src/sse/sse_hsv_to_image.rs new file mode 100644 index 0000000..53a207b --- /dev/null +++ b/src/sse/sse_hsv_to_image.rs @@ -0,0 +1,232 @@ +use crate::image::ImageConfiguration; +use crate::image_to_hsv_support::HsvTarget; +use crate::sse::sse_color::{sse_hsl_to_rgb, sse_hsv_to_rgb}; +use crate::sse::{ + sse_deinterleave_rgb_epi16, sse_deinterleave_rgba_epi16, sse_interleave_rgb, + sse_interleave_rgba, +}; +use std::arch::x86_64::*; + +#[inline] +pub unsafe fn sse_hsv_u16_to_image< + const CHANNELS_CONFIGURATION: u8, + const USE_ALPHA: bool, + const TARGET: u8, +>( + start_cx: usize, + src: *const u16, + src_offset: usize, + width: u32, + dst: *mut u8, + dst_offset: usize, + scale: f32, +) -> usize { + let target: HsvTarget = TARGET.into(); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let mut cx = start_cx; + if USE_ALPHA { + if !image_configuration.has_alpha() { + panic!("Use alpha flag used on image without alpha"); + } + } + + let channels = image_configuration.get_channels_count(); + + let v_scale = _mm_set1_ps(scale); + + let dst_ptr = dst.add(dst_offset); + let src_load_ptr = (src as *const u8).add(src_offset) as *const u16; + + while cx + 16 < width as usize { + let (h_chan, s_chan, v_chan, a_chan_lo); + let src_ptr = src_load_ptr.add(cx * channels); + + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + + match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Bgr => { + let (h_c, s_c, v_c) = sse_deinterleave_rgb_epi16(row0, row1, row2); + h_chan = h_c; + s_chan = s_c; + v_chan = v_c; + a_chan_lo = _mm_set1_epi16(255); + } + ImageConfiguration::Rgba | ImageConfiguration::Bgra => { + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (h_c, s_c, v_c, a_c) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + h_chan = h_c; + s_chan = s_c; + v_chan = v_c; + a_chan_lo = a_c; + } + } + + let h_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(h_chan)); + let s_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(s_chan)); + let v_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_chan)); + + let (r_low, g_low, b_low) = match target { + HsvTarget::HSV => sse_hsv_to_rgb(h_low, s_low, v_low, v_scale), + HsvTarget::HSL => sse_hsl_to_rgb(h_low, s_low, v_low, v_scale), + }; + + let zeros = _mm_setzero_si128(); + + let h_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(h_chan, zeros)); + let s_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s_chan, zeros)); + let v_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_chan, zeros)); + + let (r_high, g_high, b_high) = match target { + HsvTarget::HSV => sse_hsv_to_rgb(h_high, s_high, v_high, v_scale), + HsvTarget::HSL => sse_hsl_to_rgb(h_high, s_high, v_high, v_scale), + }; + + let r_chan_16_lo = _mm_packus_epi32(r_low, r_high); + let g_chan_16_lo = _mm_packus_epi32(g_low, g_high); + let b_chan_16_lo = _mm_packus_epi32(b_low, b_high); + + let (h_chan, s_chan, v_chan, a_chan_hi); + let src_ptr = src_load_ptr.add(cx * channels); + + let src_ptr = src_ptr.add(8 * channels); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + + match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Bgr => { + let (h_c, s_c, v_c) = sse_deinterleave_rgb_epi16(row0, row1, row2); + h_chan = h_c; + s_chan = s_c; + v_chan = v_c; + a_chan_hi = _mm_set1_epi16(255); + } + ImageConfiguration::Rgba | ImageConfiguration::Bgra => { + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (h_c, s_c, v_c, a_c) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + h_chan = h_c; + s_chan = s_c; + v_chan = v_c; + a_chan_hi = a_c; + } + } + + let h_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(h_chan)); + let s_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(s_chan)); + let v_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_chan)); + + let (r_low, g_low, b_low) = match target { + HsvTarget::HSV => sse_hsv_to_rgb(h_low, s_low, v_low, v_scale), + HsvTarget::HSL => sse_hsl_to_rgb(h_low, s_low, v_low, v_scale), + }; + + let zeros = _mm_setzero_si128(); + + let h_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(h_chan, zeros)); + let s_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s_chan, zeros)); + let v_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_chan, zeros)); + + let (r_high, g_high, b_high) = match target { + HsvTarget::HSV => sse_hsv_to_rgb(h_high, s_high, v_high, v_scale), + HsvTarget::HSL => sse_hsl_to_rgb(h_high, s_high, v_high, v_scale), + }; + + let r_chan_16_hi = _mm_packus_epi32(r_low, r_high); + let g_chan_16_hi = _mm_packus_epi32(g_low, g_high); + let b_chan_16_hi = _mm_packus_epi32(b_low, b_high); + + let r_chan = _mm_packus_epi16(r_chan_16_lo, r_chan_16_hi); + let g_chan = _mm_packus_epi16(g_chan_16_lo, g_chan_16_hi); + let b_chan = _mm_packus_epi16(b_chan_16_lo, b_chan_16_hi); + + let ptr = dst_ptr.add(cx * channels); + if USE_ALPHA { + let a_chan = _mm_packus_epi16(a_chan_lo, a_chan_hi); + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(r_chan, g_chan, b_chan, a_chan); + _mm_storeu_si128(ptr as *mut __m128i, rgba0); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(ptr.add(48) as *mut __m128i, rgba3); + } else { + let (rgba0, rgba1, rgba2) = sse_interleave_rgb(r_chan, g_chan, b_chan); + _mm_storeu_si128(ptr as *mut __m128i, rgba0); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(ptr.add(32) as *mut __m128i, rgba2); + } + + cx += 16; + } + + while cx + 8 < width as usize { + let (h_chan, s_chan, v_chan, a_chan_lo); + let src_ptr = src_load_ptr.add(cx * channels); + + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + + match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Bgr => { + let (h_c, s_c, v_c) = sse_deinterleave_rgb_epi16(row0, row1, row2); + h_chan = h_c; + s_chan = s_c; + v_chan = v_c; + a_chan_lo = _mm_set1_epi16(255); + } + ImageConfiguration::Rgba | ImageConfiguration::Bgra => { + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (h_c, s_c, v_c, a_c) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + h_chan = h_c; + s_chan = s_c; + v_chan = v_c; + a_chan_lo = a_c; + } + } + + let h_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(h_chan)); + let s_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(s_chan)); + let v_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_chan)); + + let (r_low, g_low, b_low) = match target { + HsvTarget::HSV => sse_hsv_to_rgb(h_low, s_low, v_low, v_scale), + HsvTarget::HSL => sse_hsl_to_rgb(h_low, s_low, v_low, v_scale), + }; + + let zeros = _mm_setzero_si128(); + + let h_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(h_chan, zeros)); + let s_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s_chan, zeros)); + let v_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_chan, zeros)); + + let (r_high, g_high, b_high) = match target { + HsvTarget::HSV => sse_hsv_to_rgb(h_high, s_high, v_high, v_scale), + HsvTarget::HSL => sse_hsl_to_rgb(h_high, s_high, v_high, v_scale), + }; + + let r_chan_16_lo = _mm_packus_epi32(r_low, r_high); + let g_chan_16_lo = _mm_packus_epi32(g_low, g_high); + let b_chan_16_lo = _mm_packus_epi32(b_low, b_high); + + let r_chan = _mm_packus_epi16(r_chan_16_lo, zeros); + let g_chan = _mm_packus_epi16(g_chan_16_lo, zeros); + let b_chan = _mm_packus_epi16(b_chan_16_lo, zeros); + + let ptr = dst_ptr.add(cx * channels); + if USE_ALPHA { + let a_chan = _mm_packus_epi16(a_chan_lo, _mm_setzero_si128()); + let (rgba0, rgba1, _, _) = sse_interleave_rgba(r_chan, g_chan, b_chan, a_chan); + _mm_storeu_si128(ptr as *mut __m128i, rgba0); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, rgba1); + } else { + let (rgba0, rgba1, _) = sse_interleave_rgb(r_chan, g_chan, b_chan); + _mm_storeu_si128(ptr as *mut __m128i, rgba0); + std::ptr::copy_nonoverlapping(&rgba1 as *const _ as *const u8, ptr.add(16), 8); + } + + cx += 8; + } + + cx +} diff --git a/src/sse/sse_image_to_hsv.rs b/src/sse/sse_image_to_hsv.rs new file mode 100644 index 0000000..84b9d78 --- /dev/null +++ b/src/sse/sse_image_to_hsv.rs @@ -0,0 +1,192 @@ +use crate::image::ImageConfiguration; +use crate::image_to_hsv_support::HsvTarget; +use crate::sse::sse_color::{sse_rgb_to_hsl, sse_rgb_to_hsv}; +use crate::sse::{ + sse_deinterleave_rgb, sse_deinterleave_rgba, sse_interleave_rgb_epi16, + sse_interleave_rgba_epi16, +}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline] +pub unsafe fn sse_channels_to_hsv_u16< + const CHANNELS_CONFIGURATION: u8, + const USE_ALPHA: bool, + const TARGET: u8, +>( + start_cx: usize, + src: *const u8, + src_offset: usize, + width: u32, + dst: *mut u16, + dst_offset: usize, + scale: f32, +) -> usize { + let target: HsvTarget = TARGET.into(); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let mut cx = start_cx; + if USE_ALPHA { + if !image_configuration.has_alpha() { + panic!("Use alpha flag used on image without alpha"); + } + } + + let channels = image_configuration.get_channels_count(); + + let v_scale = _mm_set1_ps(scale); + + let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut u16; + + while cx + 16 < width as usize { + let (r_chan, g_chan, b_chan, a_chan); + let src_ptr = src.add(src_offset + cx * channels); + let row1 = _mm_loadu_si128(src_ptr as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Bgr => { + let (rgb0_, rgb1_, rgb2_) = sse_deinterleave_rgb(row1, row2, row3); + if image_configuration == ImageConfiguration::Rgb { + r_chan = rgb0_; + g_chan = rgb1_; + b_chan = rgb2_; + } else { + r_chan = rgb2_; + g_chan = rgb1_; + b_chan = rgb0_; + } + a_chan = _mm_setzero_si128(); + } + ImageConfiguration::Rgba => { + let row4 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rgb0_, rgb1_, rgb2_, rgb3_) = sse_deinterleave_rgba(row1, row2, row3, row4); + r_chan = rgb0_; + g_chan = rgb1_; + b_chan = rgb2_; + a_chan = rgb3_; + } + ImageConfiguration::Bgra => { + let row4 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rgb0_, rgb1_, rgb2_, rgb3_) = sse_deinterleave_rgba(row1, row2, row3, row4); + r_chan = rgb2_; + g_chan = rgb1_; + b_chan = rgb0_; + a_chan = rgb3_; + } + } + + let zeros = _mm_setzero_si128(); + + let r_low = _mm_unpacklo_epi8(r_chan, zeros); + let g_low = _mm_unpacklo_epi8(g_chan, zeros); + let b_low = _mm_unpacklo_epi8(b_chan, zeros); + + let r_low_low = _mm_unpacklo_epi16(r_low, zeros); + let g_low_low = _mm_unpacklo_epi16(g_low, zeros); + let b_low_low = _mm_unpacklo_epi16(b_low, zeros); + + let (x_low_low, y_low_low, z_low_low) = match target { + HsvTarget::HSV => sse_rgb_to_hsv(r_low_low, g_low_low, b_low_low, v_scale), + HsvTarget::HSL => sse_rgb_to_hsl(r_low_low, g_low_low, b_low_low, v_scale), + }; + + let a_low = _mm_unpacklo_epi8(a_chan, zeros); + + let r_low_high = _mm_unpackhi_epi16(r_low, zeros); + let g_low_high = _mm_unpackhi_epi16(g_low, zeros); + let b_low_high = _mm_unpackhi_epi16(b_low, zeros); + + let (x_low_high, y_low_high, z_low_high) = match target { + HsvTarget::HSV => sse_rgb_to_hsv(r_low_high, g_low_high, b_low_high, v_scale), + HsvTarget::HSL => sse_rgb_to_hsl(r_low_high, g_low_high, b_low_high, v_scale), + }; + + const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + let x_low = _mm_packus_epi32( + _mm_cvtps_epi32(_mm_round_ps::(x_low_low)), + _mm_cvtps_epi32(_mm_round_ps::(x_low_high)), + ); + let y_low = _mm_packus_epi32( + _mm_cvtps_epi32(_mm_round_ps::(y_low_low)), + _mm_cvtps_epi32(_mm_round_ps::(y_low_high)), + ); + let z_low = _mm_packus_epi32( + _mm_cvtps_epi32(_mm_round_ps::(z_low_low)), + _mm_cvtps_epi32(_mm_round_ps::(z_low_high)), + ); + + if USE_ALPHA { + let (row1, row2, row3, row4) = sse_interleave_rgba_epi16(x_low, y_low, z_low, a_low); + let ptr = dst_ptr.add(cx * channels); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3); + _mm_storeu_si128(ptr.add(24) as *mut __m128i, row4); + } else { + let (row1, row2, row3) = sse_interleave_rgb_epi16(x_low, y_low, z_low); + let ptr = dst_ptr.add(cx * channels); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3); + } + + let r_high = _mm_unpackhi_epi8(r_chan, zeros); + let g_high = _mm_unpackhi_epi8(g_chan, zeros); + let b_high = _mm_unpackhi_epi8(b_chan, zeros); + + let r_high_low = _mm_unpacklo_epi16(r_high, zeros); + let g_high_low = _mm_unpacklo_epi16(g_high, zeros); + let b_high_low = _mm_unpacklo_epi16(b_high, zeros); + + let (x_high_low, y_high_low, z_high_low) = match target { + HsvTarget::HSV => sse_rgb_to_hsv(r_high_low, g_high_low, b_high_low, v_scale), + HsvTarget::HSL => sse_rgb_to_hsl(r_high_low, g_high_low, b_high_low, v_scale), + }; + + let a_high = _mm_unpackhi_epi8(a_chan, zeros); + + let r_high_high = _mm_unpackhi_epi16(r_high, zeros); + let g_high_high = _mm_unpackhi_epi16(g_high, zeros); + let b_high_high = _mm_unpackhi_epi16(b_high, zeros); + + let (x_high_high, y_high_high, z_high_high) = match target { + HsvTarget::HSV => sse_rgb_to_hsv(r_high_high, g_high_high, b_high_high, v_scale), + HsvTarget::HSL => sse_rgb_to_hsl(r_high_high, g_high_high, b_high_high, v_scale), + }; + + let x_high = _mm_packus_epi32( + _mm_cvtps_epi32(_mm_round_ps::(x_high_low)), + _mm_cvtps_epi32(_mm_round_ps::(x_high_high)), + ); + let y_high = _mm_packus_epi32( + _mm_cvtps_epi32(_mm_round_ps::(y_high_low)), + _mm_cvtps_epi32(_mm_round_ps::(y_high_high)), + ); + let z_high = _mm_packus_epi32( + _mm_cvtps_epi32(_mm_round_ps::(z_high_low)), + _mm_cvtps_epi32(_mm_round_ps::(z_high_high)), + ); + + if USE_ALPHA { + let (row1, row2, row3, row4) = + sse_interleave_rgba_epi16(x_high, y_high, z_high, a_high); + let ptr = dst_ptr.add(cx * channels + 8 * channels); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3); + _mm_storeu_si128(ptr.add(24) as *mut __m128i, row4); + } else { + let (row1, row2, row3) = sse_interleave_rgb_epi16(x_high, y_high, z_high); + let ptr = dst_ptr.add(cx * channels + 8 * channels); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3); + } + + cx += 16; + } + + cx +} diff --git a/src/sse/sse_image_to_linear_u8.rs b/src/sse/sse_image_to_linear_u8.rs index 30d8efe..57e399b 100644 --- a/src/sse/sse_image_to_linear_u8.rs +++ b/src/sse/sse_image_to_linear_u8.rs @@ -6,8 +6,6 @@ pub mod sse_image_to_linear_unsigned { use crate::image::ImageConfiguration; #[allow(unused_imports)] use crate::image_to_xyz_lab::XyzTarget; - #[allow(unused_imports)] - use crate::neon_gamma_curves::*; use crate::sse::*; #[cfg(target_arch = "x86")] use std::arch::x86::*; diff --git a/src/sse/sse_math.rs b/src/sse/sse_math.rs index ed61e1e..18db437 100644 --- a/src/sse/sse_math.rs +++ b/src/sse/sse_math.rs @@ -1,7 +1,7 @@ -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; #[cfg(target_arch = "x86")] use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] @@ -219,7 +219,11 @@ pub unsafe fn _mm_vilogbk_ps(d: __m128) -> __m128i { ); let q = _mm_sub_epi32( q, - _mm_select_si128(_mm_castps_si128(o), _mm_set1_epi32(64 + 0x7f), _mm_set1_epi32(0x7f)), + _mm_select_si128( + _mm_castps_si128(o), + _mm_set1_epi32(64 + 0x7f), + _mm_set1_epi32(0x7f), + ), ); return q; } @@ -247,6 +251,14 @@ pub(crate) unsafe fn _mm_neg_epi32(x: __m128i) -> __m128i { return _mm_sub_epi32(high, x); } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +#[allow(dead_code)] +pub(crate) unsafe fn _mm_neg_ps(x: __m128) -> __m128 { + let high = _mm_set1_ps(0f32); + return _mm_sub_ps(high, x); +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -268,7 +280,10 @@ pub unsafe fn _mm_cbrt_ps_ulp35(d: __m128) -> __m128 { let t = _mm_add_ps(_mm_cvtepi32_ps(e), _mm_set1_ps(6144f32)); let qu = _mm_cvttps_epi32(_mm_mul_ps(t, _mm_set1_ps(1.0f32 / 3.0f32))); - let re = _mm_cvttps_epi32(_mm_sub_ps(t, _mm_mul_ps(_mm_cvtepi32_ps(qu), _mm_set1_ps(3f32)))); + let re = _mm_cvttps_epi32(_mm_sub_ps( + t, + _mm_mul_ps(_mm_cvtepi32_ps(qu), _mm_set1_ps(3f32)), + )); q = _mm_selecti_ps( _mm_cmpeq_epi32(re, _mm_set1_epi32(1)), @@ -327,3 +342,16 @@ pub unsafe fn _mm_color_matrix_ps( let new_b = _mm_prefer_fma_ps(_mm_prefer_fma_ps(_mm_mul_ps(g, c8), b, c9), r, c7); (new_r, new_g, new_b) } + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +#[allow(dead_code)] +pub(crate) unsafe fn _mm_fmod_ps(a: __m128, b: __m128) -> __m128 { + let dividend_vec = a; + let divisor_vec = b; + let division = _mm_mul_ps(dividend_vec, _mm_rcp_ps(divisor_vec)); // Perform division + let int_part = _mm_floor_ps(division); // Get the integer part using floor + let product = _mm_mul_ps(int_part, divisor_vec); // Multiply the integer part by the divisor + let remainder = _mm_sub_ps(dividend_vec, product); // Subtract the product from the dividend + remainder +} \ No newline at end of file diff --git a/src/sse/sse_support.rs b/src/sse/sse_support.rs index 3379922..411184f 100644 --- a/src/sse/sse_support.rs +++ b/src/sse/sse_support.rs @@ -1,8 +1,8 @@ +use crate::avx::shuffle; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use crate::avx::shuffle; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] @@ -22,7 +22,6 @@ pub unsafe fn sse_interleave_even(x: __m128i) -> __m128i { return new_lane; } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -66,7 +65,6 @@ pub unsafe fn sse_transpose_x4( (row1, row2, row3, row4) } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -254,6 +252,95 @@ pub unsafe fn sse_interleave_rgb( (v0, v1, v2) } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn sse_interleave_rgb_epi16( + a: __m128i, + b: __m128i, + c: __m128i, +) -> (__m128i, __m128i, __m128i) { + let sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + let sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + let sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + let a0 = _mm_shuffle_epi8(a, sh_a); + let b0 = _mm_shuffle_epi8(b, sh_b); + let c0 = _mm_shuffle_epi8(c, sh_c); + + let v0 = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(a0, b0), c0); + let v1 = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(c0, a0), b0); + let v2 = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(b0, c0), a0); + (v0, v1, v2) +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn sse_interleave_rgba_epi16( + a: __m128i, + b: __m128i, + c: __m128i, + d: __m128i, +) -> (__m128i, __m128i, __m128i, __m128i) { + let u0 = _mm_unpacklo_epi16(a, c); // a0 c0 a1 c1 ... + let u1 = _mm_unpackhi_epi16(a, c); // a4 c4 a5 c5 ... + let u2 = _mm_unpacklo_epi16(b, d); // b0 d0 b1 d1 ... + let u3 = _mm_unpackhi_epi16(b, d); // b4 d4 b5 d5 ... + + let v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ... + let v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... + let v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... + let v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ... + (v0, v1, v2, v3) +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn sse_deinterleave_rgba_epi16( + u0: __m128i, + u1: __m128i, + u2: __m128i, + u3: __m128i, +) -> (__m128i, __m128i, __m128i, __m128i) { + let v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... + let v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... + let v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... + let v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... + + let u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ... + let u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ... + let u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ... + let u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ... + + let a = _mm_unpacklo_epi16(u0, u1); + let b = _mm_unpackhi_epi16(u0, u1); + let c = _mm_unpacklo_epi16(u2, u3); + let d = _mm_unpackhi_epi16(u2, u3); + (a, b, c ,d) +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn sse_deinterleave_rgb_epi16( + v0: __m128i, + v1: __m128i, + v2: __m128i, +) -> (__m128i, __m128i, __m128i) { + let a0 = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(v0, v1), v2); + let b0 = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(v2, v0), v1); + let c0 = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(v1, v2), v0); + + let sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + let sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + let sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + let a0 = _mm_shuffle_epi8(a0, sh_a); + let b0 = _mm_shuffle_epi8(b0, sh_b); + let c0 = _mm_shuffle_epi8(c0, sh_c); + (a0, b0, c0) +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -264,7 +351,6 @@ pub unsafe fn sse_store_rgb_u8(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i) _mm_storeu_si128(ptr.add(32) as *mut __m128i, v2); } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -294,4 +380,4 @@ pub unsafe fn sse_deinterleave_rgba_ps( let v2 = _mm_unpacklo_ps(t02hi, t13hi); let v3 = _mm_unpackhi_ps(t02hi, t13hi); (v0, v1, v2, v3) -} \ No newline at end of file +} diff --git a/src/sse/sse_to_linear.rs b/src/sse/sse_to_linear.rs index b809789..c48276a 100644 --- a/src/sse/sse_to_linear.rs +++ b/src/sse/sse_to_linear.rs @@ -4,8 +4,6 @@ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; #[allow(unused_imports)] use crate::image_to_xyz_lab::XyzTarget; -#[allow(unused_imports)] -use crate::neon_gamma_curves::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[allow(unused_imports)] use crate::sse::*; diff --git a/src/sse/sse_to_xyz_lab.rs b/src/sse/sse_to_xyz_lab.rs index 7fc35cd..1da973a 100644 --- a/src/sse/sse_to_xyz_lab.rs +++ b/src/sse/sse_to_xyz_lab.rs @@ -2,8 +2,6 @@ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; #[allow(unused_imports)] -use crate::neon_gamma_curves::*; -#[allow(unused_imports)] use crate::sse::*; #[allow(unused_imports)] use crate::image_to_xyz_lab::XyzTarget; diff --git a/src/sse/sse_to_xyza_laba.rs b/src/sse/sse_to_xyza_laba.rs index 55ee60b..bbd259d 100644 --- a/src/sse/sse_to_xyza_laba.rs +++ b/src/sse/sse_to_xyza_laba.rs @@ -4,8 +4,6 @@ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; #[allow(unused_imports)] use crate::image_to_xyz_lab::XyzTarget; -#[allow(unused_imports)] -use crate::neon_gamma_curves::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[allow(unused_imports)] use crate::sse::*;