diff --git a/Cargo.lock b/Cargo.lock index 1789089..9c24aae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -94,15 +94,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitstream-io" -version = "2.3.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c12d1856e42f0d817a835fe55853957c85c8c8a470114029143d3f12671446e" +checksum = "3dcde5f311c85b8ca30c2e4198d4326bc342c76541590106f5fa4a50946ea499" [[package]] name = "built" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6a6c0b39c38fd754ac338b00a88066436389c0f029da5d37d1e01091d9b7c17" +checksum = "236e6289eda5a812bc6b53c3b024039382a2895fbbeef2d748b2931546d392c4" [[package]] name = "bumpalo" @@ -112,9 +112,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.16.0" +version = "1.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" +checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" [[package]] name = "byteorder" @@ -130,9 +130,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" [[package]] name = "cc" -version = "1.0.98" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" +checksum = "eaff6f8ce506b9773fa786672d63fc7a191ffea1be33f72bbd4aeacefca9ffc8" dependencies = [ "jobserver", "libc", @@ -163,7 +163,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.4.12" +version = "0.4.13" dependencies = [ "erydanos", "half", @@ -211,9 +211,9 @@ checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" [[package]] name = "either" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "equivalent" @@ -223,9 +223,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "erydanos" -version = "0.1.0" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b0354c3359e57ded8b5f8a120273cb1da304630399da59afa14182404573d6f" +checksum = "1a140744bdb5b8777d9714a8d6a72c5e58d4eb2b0c3c8a85c8bada86efd9fa21" dependencies = [ "num-traits", ] @@ -436,9 +436,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.21" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "loop9" @@ -461,9 +461,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "minimal-lexical" @@ -473,9 +473,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", "simd-adler32", @@ -505,9 +505,9 @@ checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" [[package]] name = "num-bigint" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", @@ -592,9 +592,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.84" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] @@ -709,9 +709,9 @@ dependencies = [ [[package]] name = "ravif" -version = "0.11.5" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc13288f5ab39e6d7c9d501759712e6969fcc9734220846fc9ed26cae2cc4234" +checksum = "c6ba61c28ba24c0cf8406e025cb29a742637e3f70776e61c27a8a8b72a042d12" dependencies = [ "avif-serialize", "imgref", @@ -744,9 +744,9 @@ dependencies = [ [[package]] name = "rgb" -version = "0.8.37" +version = "0.8.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05aaa8004b64fd573fc9d002f4e632d51ad4f026c2b5ba95fcb6c2f32c2c47d8" +checksum = "1aee83dc281d5a3200d37b299acd13b81066ea126a7f16f0eae70fc9aed241d9" dependencies = [ "bytemuck", ] @@ -759,18 +759,18 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "serde" -version = "1.0.203" +version = "1.0.204" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.204" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" dependencies = [ "proc-macro2", "quote", @@ -818,9 +818,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.66" +version = "2.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" +checksum = "2f0209b68b3613b093e0ec905354eccaedcfe83b8cb37cbdeae64026c3064c16" dependencies = [ "proc-macro2", "quote", @@ -842,9 +842,9 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.14" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" +checksum = "4873307b7c257eddcb50c9bedf158eb669578359fb28428bef438fec8e6ba7c2" [[package]] name = "thiserror" @@ -879,9 +879,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.13" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4e43f8cc456c9704c851ae29c67e17ef65d2c30017c17a9765b89c382dc8bba" +checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" dependencies = [ "serde", "serde_spanned", @@ -900,9 +900,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.13" +version = "0.22.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c127785850e8c20836d49732ae6abfa47616e60bf9d9f57c43c250361a9db96c" +checksum = "d59a3a72298453f564e2b111fa896f8d07fabb36f51f06d7e875fc5e0b5a3ef1" dependencies = [ "indexmap", "serde", @@ -1002,9 +1002,9 @@ checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" [[package]] name = "winnow" -version = "0.6.9" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86c949fede1d13936a99f14fafd3e76fd642b556dd2ce96287fbe2e0151bfac6" +checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 35950dd..d0d17d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] } [package] name = "colorutils-rs" -version = "0.4.12" +version = "0.4.13" edition = "2021" description = "High performance utilities for color format handling and conversion." readme = "README.md" @@ -16,7 +16,7 @@ repository = "https://github.com/awxkee/colorutils-rs" exclude = ["*.jpg"] [dependencies] -erydanos = "0.1.0" +erydanos = "0.2.3" half = "2.4.1" [features] diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 9bbdfae..5992c39 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -58,7 +58,7 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0f32); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_lab( + rgb_to_lch( src_bytes, src_stride, &mut lab_store, @@ -92,7 +92,7 @@ fn main() { // } let start_time = Instant::now(); - lab_to_srgb( + lch_to_rgb( &lab_store, store_stride as u32, &mut dst_slice, diff --git a/src/linear_to_planar.rs b/src/linear_to_planar.rs index ba90d4e..2b5a9c8 100644 --- a/src/linear_to_planar.rs +++ b/src/linear_to_planar.rs @@ -3,6 +3,11 @@ target_feature = "neon" ))] use crate::neon::linear_to_planar::neon_linear_plane_to_gamma; +#[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" +))] +use crate::sse::sse_linear_plane_to_gamma; use crate::TransferFunction; #[inline(always)] @@ -20,6 +25,20 @@ fn linear_to_gamma_channels( let transfer = transfer_function.get_gamma_function(); + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + let mut _has_sse = false; + + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + if is_x86_feature_detected!("sse4.1") { + _has_sse = true; + } + for _ in 0..height as usize { let mut _cx = 0usize; @@ -39,6 +58,24 @@ fn linear_to_gamma_channels( ); } + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + _cx = sse_linear_plane_to_gamma( + _cx, + src.as_ptr(), + src_offset as u32, + dst.as_mut_ptr(), + dst_offset as u32, + width, + transfer_function, + ); + } + } + let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *const f32 }; let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; diff --git a/src/neon/cie.rs b/src/neon/cie.rs index 4630546..32d5fe1 100644 --- a/src/neon/cie.rs +++ b/src/neon/cie.rs @@ -3,12 +3,8 @@ use crate::luv::{ LUV_WHITE_V_PRIME, }; use crate::neon::math::{prefer_vfmaq_f32, vcolorq_matrix_f32, vcubeq_f32}; -use erydanos::neon::atan2f::vatan2q_f32; -use erydanos::neon::cbrtf::vcbrtq_f32; -use erydanos::neon::cosf::vcosq_f32; -use erydanos::neon::hypotf::vhypotq_fast_f32; -use erydanos::neon::sinf::vsinq_f32; use std::arch::aarch64::*; +use erydanos::{vatan2q_f32, vcbrtq_f32, vcosq_f32, vhypotq_fast_f32, vsinq_f32}; #[inline(always)] pub(crate) unsafe fn neon_triple_to_xyz( diff --git a/src/neon/math.rs b/src/neon/math.rs index 2852e44..b1a7ad4 100644 --- a/src/neon/math.rs +++ b/src/neon/math.rs @@ -1,6 +1,5 @@ use std::arch::aarch64::*; - -use erydanos::neon::powf::vpowq_fast_f32; +use erydanos::vpowq_fast_f32; #[inline(always)] #[allow(dead_code)] diff --git a/src/planar_to_linear.rs b/src/planar_to_linear.rs index 7bf4672..380c0d8 100644 --- a/src/planar_to_linear.rs +++ b/src/planar_to_linear.rs @@ -3,6 +3,11 @@ target_feature = "neon" ))] use crate::neon::planar_to_linear::neon_plane_to_linear; +#[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" +))] +use crate::sse::sse_plane_to_linear; use crate::TransferFunction; #[inline(always)] @@ -18,6 +23,20 @@ fn channels_to_linear( let mut src_offset = 0usize; let mut dst_offset = 0usize; + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + let mut _has_sse = false; + + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + if is_x86_feature_detected!("sse4.1") { + _has_sse = true; + } + let transfer = transfer_function.get_linearize_function(); for _ in 0..height as usize { let mut _cx = 0usize; @@ -25,6 +44,24 @@ fn channels_to_linear( let src_ptr = unsafe { src.as_ptr().add(src_offset) }; let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + unsafe { + if _has_sse { + _cx = sse_plane_to_linear( + _cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + transfer_function, + ); + } + } + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" diff --git a/src/sse/cie.rs b/src/sse/cie.rs index 1981d93..d4c8d93 100644 --- a/src/sse/cie.rs +++ b/src/sse/cie.rs @@ -3,13 +3,14 @@ use crate::luv::{ LUV_WHITE_V_PRIME, }; use crate::sse::{ - _mm_atan2_ps, _mm_cbrt_ps, _mm_color_matrix_ps, _mm_cos_ps, _mm_cube_ps, _mm_hypot_ps, - _mm_prefer_fma_ps, _mm_select_ps, _mm_sin_ps, + _mm_color_matrix_ps, _mm_cube_ps, + _mm_prefer_fma_ps, _mm_select_ps, }; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; +use erydanos::{_mm_atan2_ps, _mm_cbrt_ps, _mm_cos_ps, _mm_hypot_ps, _mm_sin_ps}; #[inline(always)] pub(crate) unsafe fn sse_triple_to_xyz( diff --git a/src/sse/color.rs b/src/sse/color.rs index 517f6e5..84be872 100644 --- a/src/sse/color.rs +++ b/src/sse/color.rs @@ -1,8 +1,9 @@ -use crate::sse::{_mm_abs_ps, _mm_fmod_ps, _mm_prefer_fma_ps, _mm_select_ps}; +use crate::sse::{_mm_prefer_fma_ps, _mm_select_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; +use erydanos::{_mm_abs_ps, _mm_fmod_ps}; #[inline(always)] pub unsafe fn sse_hsl_to_rgb( diff --git a/src/sse/linear_to_planar.rs b/src/sse/linear_to_planar.rs new file mode 100644 index 0000000..19eb088 --- /dev/null +++ b/src/sse/linear_to_planar.rs @@ -0,0 +1,81 @@ +use crate::sse::{_mm_loadu_ps_x4, _mm_storeu_si128_x4, get_sse_gamma_transfer}; +use crate::TransferFunction; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline(always)] +unsafe fn transfer_to_gamma(r: __m128, transfer: &unsafe fn(__m128) -> __m128) -> __m128i { + const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + let r_f = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( + transfer(r), + _mm_set1_ps(255f32), + ))); + r_f +} + +#[inline(always)] +unsafe fn process_set( + k: (__m128, __m128, __m128, __m128), + function: &unsafe fn(__m128) -> __m128, +) -> __m128i { + let y0 = transfer_to_gamma(k.0, &function); + let y1 = transfer_to_gamma(k.1, &function); + let y2 = transfer_to_gamma(k.2, &function); + let y3 = transfer_to_gamma(k.3, &function); + + let y_row01 = _mm_packus_epi32(y0, y1); + let y_row23 = _mm_packus_epi32(y2, y3); + + let r_row = _mm_packus_epi16(y_row01, y_row23); + r_row +} + +#[inline] +pub unsafe fn sse_linear_plane_to_gamma( + start_cx: usize, + src: *const f32, + src_offset: u32, + dst: *mut u8, + dst_offset: u32, + width: u32, + transfer_function: TransferFunction, +) -> usize { + let mut cx = start_cx; + + let function = get_sse_gamma_transfer(transfer_function); + + while cx + 64 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); + + let pixel_row0 = _mm_loadu_ps_x4(offset_src_ptr); + let pixel_row1 = _mm_loadu_ps_x4(offset_src_ptr.add(16)); + let pixel_row2 = _mm_loadu_ps_x4(offset_src_ptr.add(32)); + let pixel_row3 = _mm_loadu_ps_x4(offset_src_ptr.add(48)); + + let set0 = process_set(pixel_row0, &function); + let set1 = process_set(pixel_row1, &function); + let set2 = process_set(pixel_row2, &function); + let set3 = process_set(pixel_row3, &function); + + let dst_ptr = dst.add(dst_offset as usize + cx); + + _mm_storeu_si128_x4(dst_ptr, (set0, set1, set2, set3)); + + cx += 64; + } + + while cx + 16 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); + + let pixel_row = _mm_loadu_ps_x4(offset_src_ptr); + let r_row = process_set(pixel_row, &function); + let dst_ptr = dst.add(dst_offset as usize + cx); + _mm_storeu_si128(dst_ptr as *mut __m128i, r_row); + + cx += 16; + } + + cx +} diff --git a/src/sse/math.rs b/src/sse/math.rs index a3defbc..94bfef1 100644 --- a/src/sse/math.rs +++ b/src/sse/math.rs @@ -3,6 +3,8 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; +use erydanos::_mm_pow_ps; + #[inline(always)] pub unsafe fn _mm_cube_ps(x: __m128) -> __m128 { _mm_mul_ps(_mm_mul_ps(x, x), x) @@ -42,40 +44,6 @@ unsafe fn _mm_taylorpoly_ps( return res; } -#[inline(always)] -pub unsafe fn _mm_log_ps(v: __m128) -> __m128 { - let zeros = _mm_setzero_ps(); - let nan_mask = _mm_cmple_ps(v, zeros); - let const_ln127 = _mm_set1_epi32(127); // 127 - let const_ln2 = _mm_set1_ps(std::f32::consts::LN_2); // ln(2) - - // Extract exponent - let m = _mm_sub_epi32(_mm_srli_epi32::<23>(_mm_castps_si128(v)), const_ln127); - let val = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(v), _mm_slli_epi32::<23>(m))); - - let mut poly = _mm_taylorpoly_ps( - val, - _mm_set1_ps(-2.29561495781f32), - _mm_set1_ps(-2.47071170807f32), - _mm_set1_ps(-5.68692588806f32), - _mm_set1_ps(-0.165253549814f32), - _mm_set1_ps(5.17591238022f32), - _mm_set1_ps(0.844007015228f32), - _mm_set1_ps(4.58445882797f32), - _mm_set1_ps(0.0141278216615f32), - ); - - poly = _mm_prefer_fma_ps(poly, _mm_cvtepi32_ps(m), const_ln2); - - if HANDLE_NAN { - poly = _mm_select_ps(nan_mask, _mm_set1_ps(-f32::INFINITY), poly); - } else { - poly = _mm_select_ps(nan_mask, zeros, poly); - } - - poly -} - #[inline(always)] pub unsafe fn _mm_select_ps(mask: __m128, true_vals: __m128, false_vals: __m128) -> __m128 { _mm_blendv_ps(false_vals, true_vals, mask) @@ -96,131 +64,27 @@ pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __ ) } -#[inline(always)] -pub unsafe fn _mm_exp_ps(x: __m128) -> __m128 { - _mm_exp_ps_ulp_1_5::(x) -} - -#[inline(always)] -pub unsafe fn _mm_exp_ps_ulp_1_5(x: __m128) -> __m128 { - let c1 = _mm_castsi128_ps(_mm_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f - let c2 = _mm_castsi128_ps(_mm_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f - let c3 = _mm_castsi128_ps(_mm_set1_epi32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f - let c4 = _mm_castsi128_ps(_mm_set1_epi32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f - let c5 = _mm_castsi128_ps(_mm_set1_epi32(0x3c072010)); // x^5: 0x1.0e4020p-7f - - let shift = _mm_castsi128_ps(_mm_set1_epi32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - let inv_ln2 = _mm_castsi128_ps(_mm_set1_epi32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f - let neg_ln2_hi = _mm_castsi128_ps(_mm_set1_epi32(-1087278592i32)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f - let neg_ln2_lo = _mm_castsi128_ps(_mm_set1_epi32(-1245725042i32)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f - - // Range reduction: - // e^x = 2^n * e^r - // where: - // n = floor(x / ln(2)) - // r = x - n * ln(2) - // - // By adding x / ln(2) with 2^23 + 127 (shift): - // * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 forces decimal part - // of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy - // the whole fraction part of z in FP32 format. - // Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2) - // (i.e. n) because the decimal part has been pushed out and lost. - // * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent - // in FP32 format. Left shifting z by 23 bits will result in 2^n. - let z = _mm_prefer_fma_ps(shift, x, inv_ln2); - let n = _mm_sub_ps(z, shift); - let scale = _mm_castsi128_ps(_mm_slli_epi32::<23>(_mm_castps_si128(z))); // 2^n - - // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32. - // This outperforms longer Taylor series (3-4 tabs) both in terms of accuracy and performance. - let r_hi = _mm_prefer_fma_ps(x, n, neg_ln2_hi); - let r = _mm_prefer_fma_ps(r_hi, n, neg_ln2_lo); - - // Compute the truncated Taylor series of e^r. - // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) - let r2 = _mm_mul_ps(r, r); - - let p1 = _mm_mul_ps(c1, r); - let p23 = _mm_prefer_fma_ps(c2, c3, r); - let p45 = _mm_prefer_fma_ps(c4, c5, r); - let p2345 = _mm_prefer_fma_ps(p23, p45, r2); - let p12345 = _mm_prefer_fma_ps(p1, p2345, r2); - - let mut poly = _mm_prefer_fma_ps(scale, p12345, scale); - - if HANDLE_NAN { - let inf = _mm_set1_ps(f32::INFINITY); - let max_input = _mm_set1_ps(88.37f32); // Approximately ln(2^127.5) - let zero = _mm_set1_ps(0f32); - let min_input = _mm_set1_ps(-86.64f32); // Approximately ln(2^-125) - // Handle underflow and overflow. - poly = _mm_select_ps(_mm_cmplt_ps(x, min_input), zero, poly); - poly = _mm_select_ps(_mm_cmpgt_ps(x, max_input), inf, poly); - } - - return poly; -} - -#[inline(always)] -unsafe fn _mm_exp_ps_ulp_5_impl(x: __m128) -> __m128 { - let l2e = _mm_set1_ps(std::f32::consts::LOG2_E); /* log2(e) */ - let c0 = _mm_set1_ps(0.3371894346f32); - let c1 = _mm_set1_ps(0.657636276f32); - let c2 = _mm_set1_ps(1.00172476f32); - - /* exp(x) = 2^i * 2^f; i = floor (log2(e) * x), 0 <= f <= 1 */ - let t = _mm_mul_ps(x, l2e); /* t = log2(e) * x */ - let e = _mm_floor_ps(t); /* floor(t) */ - let i = _mm_cvtps_epi32(e); /* (int)floor(t) */ - let f = _mm_sub_ps(t, e); /* f = t - floor(t) */ - let mut p = c0; /* c0 */ - p = _mm_prefer_fma_ps(c1, p, f); /* c0 * f + c1 */ - p = _mm_prefer_fma_ps(c2, p, f); /* p = (c0 * f + c1) * f + c2 ~= 2^f */ - let j = _mm_slli_epi32::<23>(i); /* i << 23 */ - let r = _mm_castsi128_ps(_mm_add_epi32(j, _mm_castps_si128(p))); /* r = p * 2^i*/ - if PROCESS_NAN { - let inf = _mm_set1_ps(f32::INFINITY); - let max_input = _mm_set1_ps(88.72283f32); // Approximately ln(2^127.5) - let min_input = _mm_set1_ps(-87.33654f32); // Approximately ln(2^-125) - let poly = _mm_select_ps(_mm_cmplt_ps(x, min_input), _mm_setzero_ps(), r); - let poly = _mm_select_ps(_mm_cmpgt_ps(x, max_input), inf, poly); - return poly; - } else { - return r; - } -} - -#[inline(always)] -pub unsafe fn _mm_pow_ps(x: __m128, n: __m128) -> __m128 { - _mm_exp_ps(_mm_mul_ps(n, _mm_log_ps::(x))) -} - #[inline(always)] pub unsafe fn _mm_pow_n_ps(x: __m128, n: f32) -> __m128 { - _mm_exp_ps(_mm_mul_ps(_mm_set1_ps(n), _mm_log_ps::(x))) + _mm_pow_ps(x, _mm_set1_ps(n)) } #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm_signbit_ps(f: __m128) -> __m128i { return _mm_and_si128(_mm_castps_si128(f), _mm_castps_si128(_mm_set1_ps(-0.0f32))); } #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm_mulsign_ps(x: __m128, y: __m128) -> __m128 { return _mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(x), _mm_signbit_ps(y))); } #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm_pow2i_ps(q: __m128i) -> __m128 { return _mm_castsi128_ps(_mm_slli_epi32::<23>(_mm_add_epi32(q, _mm_set1_epi32(0x7f)))); } #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm_vldexp2_ps(d: __m128, e: __m128i) -> __m128 { return _mm_mul_ps( _mm_mul_ps(d, _mm_pow2i_ps(_mm_srli_epi32::<1>(e))), @@ -229,7 +93,6 @@ pub unsafe fn _mm_vldexp2_ps(d: __m128, e: __m128i) -> __m128 { } #[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm_vilogbk_ps(d: __m128) -> __m128i { let o = _mm_cmplt_ps(d, _mm_set1_ps(5.421010862427522E-20f32)); let d = _mm_select_ps(o, _mm_mul_ps(_mm_set1_ps(1.8446744073709552E19f32), d), d); @@ -253,13 +116,6 @@ pub(crate) unsafe fn _mm_fmaf_ps(a: __m128, b: __m128, c: __m128) -> __m128 { _mm_prefer_fma_ps(c, b, a) } -#[inline(always)] -#[allow(dead_code)] -pub(crate) unsafe fn _mm_abs_ps(x: __m128) -> __m128 { - let sign_mask = _mm_set1_ps(-0f32); - return _mm_andnot_ps(sign_mask, x); -} - #[inline(always)] #[allow(dead_code)] pub(crate) unsafe fn _mm_neg_epi32(x: __m128i) -> __m128i { @@ -273,64 +129,6 @@ pub(crate) unsafe fn _mm_neg_ps(x: __m128) -> __m128 { return _mm_sub_ps(high, x); } -#[inline(always)] -/// This is Cube Root using Pow functions, -/// it is also precise however due to of inexact nature of power 1/3 result slightly differ -/// from real cbrt with about ULP 3-4, but this is almost 2 times faster than cbrt with real ULP 3.5 -pub unsafe fn _mm_cbrt_ps(d: __m128) -> __m128 { - _mm_cbrt_ulp2_ps::(d) -} - -#[inline(always)] -#[allow(dead_code)] -/// Precise version of Cube Root, ULP 3.5 -pub unsafe fn _mm_cbrt_ps_ulp35(d: __m128) -> __m128 { - let mut q = _mm_set1_ps(1f32); - let e = _mm_add_epi32(_mm_vilogbk_ps(_mm_abs_ps(d)), _mm_set1_epi32(1)); - let mut d = _mm_vldexp2_ps(d, _mm_neg_epi32(e)); - - let t = _mm_add_ps(_mm_cvtepi32_ps(e), _mm_set1_ps(6144f32)); - let qu = _mm_cvttps_epi32(_mm_mul_ps(t, _mm_set1_ps(1.0f32 / 3.0f32))); - let re = _mm_cvttps_epi32(_mm_sub_ps( - t, - _mm_mul_ps(_mm_cvtepi32_ps(qu), _mm_set1_ps(3f32)), - )); - - q = _mm_selecti_ps( - _mm_cmpeq_epi32(re, _mm_set1_epi32(1)), - _mm_set1_ps(1.2599210498948731647672106f32), - q, - ); - q = _mm_selecti_ps( - _mm_cmpeq_epi32(re, _mm_set1_epi32(2)), - _mm_set1_ps(1.5874010519681994747517056f32), - q, - ); - q = _mm_vldexp2_ps(q, _mm_sub_epi32(qu, _mm_set1_epi32(2048))); - q = _mm_mulsign_ps(q, d); - d = _mm_abs_ps(d); - - let mut x = _mm_set1_ps(-0.601564466953277587890625f32); - x = _mm_fmaf_ps(x, d, _mm_set1_ps(2.8208892345428466796875f32)); - x = _mm_fmaf_ps(x, d, _mm_set1_ps(-5.532182216644287109375f32)); - x = _mm_fmaf_ps(x, d, _mm_set1_ps(5.898262500762939453125f32)); - x = _mm_fmaf_ps(x, d, _mm_set1_ps(-3.8095417022705078125f32)); - x = _mm_fmaf_ps(x, d, _mm_set1_ps(2.2241256237030029296875f32)); - - let mut y = _mm_mul_ps(_mm_mul_ps(d, x), x); - y = _mm_mul_ps( - _mm_sub_ps( - y, - _mm_mul_ps( - _mm_mul_ps(y, _mm_set1_ps(2.0f32 / 3.0f32)), - _mm_fmaf_ps(y, x, _mm_set1_ps(-1.0f32)), - ), - ), - q, - ); - return y; -} - #[inline(always)] pub unsafe fn _mm_cmpge_epi32(a: __m128i, b: __m128i) -> __m128i { let gt = _mm_cmpgt_epi32(a, b); @@ -344,58 +142,6 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { } #[inline(always)] -/// Precise version of Cube Root with ULP 2 -pub unsafe fn _mm_cbrt_ulp2_ps(x: __m128) -> __m128 { - let x1p24 = _mm_castsi128_ps(_mm_set1_epi32(0x4b800000)); // 0x1p24f === 2 ^ 24 - - let mut ui = _mm_cvtps_epi32(x); - let hx = _mm_and_si128(ui, _mm_set1_epi32(0x7fffffff)); - - let nan_mask = _mm_cmpge_epi32(hx, _mm_set1_epi32(0x7f800000)); - let is_zero_mask = _mm_cmpeq_epi32(hx, _mm_setzero_si128()); - - let lo_mask = _mm_cmplt_epi32(hx, _mm_set1_epi32(0x00800000)); - let hi_ui_f = _mm_castps_si128(_mm_mul_ps(x, x1p24)); - let mut lo_hx = _mm_and_si128(hi_ui_f, _mm_set1_epi32(0x7fffffff)); - let recpeq_3 = _mm_set1_ps(1f32 / 3f32); - lo_hx = _mm_add_epi32( - _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(lo_hx), recpeq_3)), - _mm_set1_epi32(642849266), - ); - let hi_hx = _mm_add_epi32( - _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(hx), recpeq_3)), - _mm_set1_epi32(709958130), - ); - let hx = _mm_select_si128(lo_mask, lo_hx, hi_hx); - - ui = _mm_select_si128(lo_mask, hi_ui_f, ui); - ui = _mm_and_si128(ui, _mm_set1_epi32(-2147483648i32)); - ui = _mm_or_si128(ui, hx); - - let mut t = _mm_castsi128_ps(ui); - let mut r = _mm_mul_ps(_mm_mul_ps(t, t), t); - - let sum_x = _mm_add_ps(x, x); - - t = _mm_mul_ps( - _mm_div_ps(_mm_add_ps(sum_x, r), _mm_add_ps(_mm_add_ps(r, r), x)), - t, - ); - - r = _mm_mul_ps(_mm_mul_ps(t, t), t); - t = _mm_mul_ps( - _mm_div_ps(_mm_add_ps(sum_x, r), _mm_add_ps(_mm_add_ps(r, r), x)), - t, - ); - if HANDLE_NAN { - t = _mm_selecti_ps(nan_mask, _mm_set1_ps(f32::NAN), t); - t = _mm_selecti_ps(is_zero_mask, _mm_setzero_ps(), t); - } - t -} - -#[inline(always)] -#[allow(dead_code)] pub unsafe fn _mm_color_matrix_ps( r: __m128, g: __m128, @@ -416,70 +162,6 @@ pub unsafe fn _mm_color_matrix_ps( (new_r, new_g, new_b) } -#[inline(always)] -#[allow(dead_code)] -pub(crate) unsafe fn _mm_fmod_ps(a: __m128, b: __m128) -> __m128 { - let dividend_vec = a; - let divisor_vec = b; - let division = _mm_mul_ps(dividend_vec, _mm_rcp_ps(divisor_vec)); // Perform division - let int_part = _mm_floor_ps(division); // Get the integer part using floor - let product = _mm_mul_ps(int_part, divisor_vec); // Multiply the integer part by the divisor - let remainder = _mm_sub_ps(dividend_vec, product); // Subtract the product from the dividend - remainder -} - -#[inline(always)] -#[allow(dead_code)] -pub unsafe fn _mm_is_infinity(d: __m128) -> __m128 { - return _mm_cmpeq_ps(_mm_abs_ps(d), _mm_set1_ps(f32::INFINITY)); -} - -#[inline(always)] -#[allow(dead_code)] -pub unsafe fn _mm_cos_ps(d: __m128) -> __m128 { - let mut q = _mm_cvtps_epi32(_mm_sub_ps( - _mm_mul_ps(d, _mm_set1_ps(std::f32::consts::FRAC_1_PI)), - _mm_set1_ps(0.5f32), - )); - - q = _mm_add_epi32(_mm_add_epi32(q, q), _mm_set1_epi32(1)); - - let mut u = _mm_cvtepi32_ps(q); - let mut d = _mm_fmaf_ps(u, _mm_set1_ps(-0.78515625f32 * 2f32), d); - d = _mm_fmaf_ps(u, _mm_set1_ps(-0.00024187564849853515625f32 * 2f32), d); - d = _mm_fmaf_ps(u, _mm_set1_ps(-3.7747668102383613586e-08f32 * 2f32), d); - d = _mm_fmaf_ps(u, _mm_set1_ps(-1.2816720341285448015e-12f32 * 2f32), d); - - let s = _mm_mul_ps(d, d); - - d = _mm_castsi128_ps(_mm_xor_si128( - _mm_and_si128( - _mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(2)), _mm_set1_epi32(0)), - _mm_castps_si128(_mm_set1_ps(-0.0f32)), - ), - _mm_castps_si128(d), - )); - - u = _mm_set1_ps(2.6083159809786593541503e-06f32); - u = _mm_fmaf_ps(u, s, _mm_set1_ps(-0.0001981069071916863322258f32)); - u = _mm_fmaf_ps(u, s, _mm_set1_ps(0.00833307858556509017944336f32)); - u = _mm_fmaf_ps(u, s, _mm_set1_ps(-0.166666597127914428710938f32)); - - u = _mm_fmaf_ps(s, _mm_mul_ps(u, d), d); - - u = _mm_or_ps(_mm_is_infinity(d), u); - - return u; -} - -#[inline(always)] -pub unsafe fn _mm_hypot_ps(x: __m128, y: __m128) -> __m128 { - let xp2 = _mm_mul_ps(x, x); - let yp2 = _mm_mul_ps(y, y); - let z = _mm_add_ps(xp2, yp2); - return _mm_sqrt_ps(z); -} - #[inline(always)] pub unsafe fn _mm_poly4_ps( x: __m128, @@ -512,106 +194,3 @@ pub unsafe fn _mm_poly8q_ps( _mm_poly4_ps(x, x2, c3, c2, c1, c0), ) } - -#[inline(always)] -unsafe fn _mm_atan2q_ps_impl(y: __m128, x: __m128) -> __m128 { - let q = _mm_select_si128( - _mm_castps_si128(_mm_cmplt_ps(x, _mm_setzero_ps())), - _mm_set1_epi32(-2), - _mm_set1_epi32(0), - ); - let x = _mm_abs_ps(x); - let is_y_more_than_x = _mm_cmpgt_ps(y, x); - let t = _mm_select_ps(is_y_more_than_x, x, _mm_setzero_ps()); - let x = _mm_select_ps(is_y_more_than_x, y, x); - let y = _mm_select_ps(is_y_more_than_x, _mm_neg_ps(t), y); - let q = _mm_select_si128( - _mm_castps_si128(is_y_more_than_x), - _mm_add_epi32(q, _mm_set1_epi32(1)), - q, - ); - let s = _mm_div_ps(y, x); - let t = _mm_mul_ps(s, s); - let t2 = _mm_mul_ps(t, t); - let t4 = _mm_mul_ps(t2, t2); - let poly = _mm_poly8q_ps( - t, - t2, - t4, - _mm_set1_ps(0.00282363896258175373077393f32), - _mm_set1_ps(-0.0159569028764963150024414f32), - _mm_set1_ps(0.0425049886107444763183594f32), - _mm_set1_ps(-0.0748900920152664184570312f32), - _mm_set1_ps(0.106347933411598205566406f32), - _mm_set1_ps(-0.142027363181114196777344f32), - _mm_set1_ps(0.199926957488059997558594f32), - _mm_set1_ps(-0.333331018686294555664062f32), - ); - let t = _mm_prefer_fma_ps(s, _mm_mul_ps(poly, t), s); - let t = _mm_prefer_fma_ps( - t, - _mm_cvtepi32_ps(q), - _mm_set1_ps(std::f32::consts::FRAC_PI_2), - ); - t -} - -#[inline(always)] -pub unsafe fn _mm_atan2_ps(y: __m128, x: __m128) -> __m128 { - let r = _mm_atan2q_ps_impl(_mm_abs_ps(y), x); - let mut r = _mm_mulsign_ps(r, x); - let zeros = _mm_setzero_ps(); - let y_zero_mask = _mm_cmpeq_ps(y, zeros); - r = _mm_select_ps( - _mm_cmpeq_ps(x, zeros), - _mm_set1_ps(std::f32::consts::FRAC_PI_2), - r, - ); - r = _mm_select_ps(y_zero_mask, zeros, r); - _mm_mulsign_ps(r, y) -} - -#[inline(always)] -pub unsafe fn _mm_sin_ps(val: __m128) -> __m128 { - let pi_v = _mm_set1_ps(std::f32::consts::PI); - let pio2_v = _mm_set1_ps(std::f32::consts::FRAC_PI_2); - let ipi_v = _mm_set1_ps(std::f32::consts::FRAC_1_PI); - - //Find positive or negative - let c_v = _mm_abs_epi32(_mm_cvtps_epi32(_mm_mul_ps(val, ipi_v))); - let sign_v = _mm_castps_si128(_mm_cmple_ps(val, _mm_setzero_ps())); - let odd_v = _mm_and_si128(c_v, _mm_set1_epi32(1)); - - let neg_v = _mm_xor_si128(odd_v, sign_v); - - //Modulus a - (n * int(a*(1/n))) - let mut ma = _mm_sub_ps(_mm_abs_ps(val), _mm_mul_ps(pi_v, _mm_cvtepi32_ps(c_v))); - let reb_v = _mm_cmpge_ps(ma, pio2_v); - - //Rebase a between 0 and pi/2 - ma = _mm_select_ps(reb_v, _mm_sub_ps(pi_v, ma), ma); - - //Taylor series - let ma2 = _mm_mul_ps(ma, ma); - - //2nd elem: x^3 / 3! - let mut elem = _mm_mul_ps(_mm_mul_ps(ma, ma2), _mm_set1_ps(0.166666666666f32)); - let mut res = _mm_sub_ps(ma, elem); - - //3rd elem: x^5 / 5! - elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.05f32)); - res = _mm_add_ps(res, elem); - - //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val) - elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.023809523810f32)); - res = _mm_sub_ps(res, elem); - - //5th elem: x^9 / 9! - elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.013888888889f32)); - res = _mm_add_ps(res, elem); - - //Change of sign - let neg_v = _mm_slli_epi32::<31>(neg_v); - res = _mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(res), neg_v)); - return res; -} diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 024129b..4803b21 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -36,6 +36,8 @@ mod from_sigmoidal; mod sigmoidal; mod to_sigmoidal; mod xyza_laba_to_image; +mod planar_to_linear; +mod linear_to_planar; pub use from_sigmoidal::sse_from_sigmoidal_row; pub use gamma_curves::*; @@ -51,3 +53,5 @@ pub use to_xyz_lab::*; pub use to_xyza_laba::*; pub use xyz_lab_to_image::*; pub use xyza_laba_to_image::*; +pub use planar_to_linear::sse_plane_to_linear; +pub use linear_to_planar::sse_linear_plane_to_gamma; \ No newline at end of file diff --git a/src/sse/planar_to_linear.rs b/src/sse/planar_to_linear.rs new file mode 100644 index 0000000..4eba72d --- /dev/null +++ b/src/sse/planar_to_linear.rs @@ -0,0 +1,86 @@ +use crate::sse::{_mm_loadu_si128_x4, _mm_storeu_ps_x4, get_sse_linear_transfer}; +use crate::TransferFunction; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline(always)] +unsafe fn sse_to_linear(r: __m128i, transfer: &unsafe fn(__m128) -> __m128) -> __m128 { + let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), _mm_set1_ps(1f32 / 255f32)); + transfer(r_f) +} + +#[inline] +unsafe fn process_pixels( + pixels: __m128i, + transfer: &unsafe fn(__m128) -> __m128, +) -> (__m128, __m128, __m128, __m128) { + let zeros = _mm_setzero_si128(); + let r_low = _mm_unpacklo_epi8(pixels, zeros); + + let r_low_low = _mm_unpacklo_epi16(r_low, zeros); + + let x_low_low = sse_to_linear(r_low_low, &transfer); + + let r_low_high = _mm_unpackhi_epi16(r_low, zeros); + + let x_low_high = sse_to_linear(r_low_high, &transfer); + + let r_high = _mm_unpackhi_epi8(pixels, zeros); + + let r_high_low = _mm_unpacklo_epi16(r_high, zeros); + + let x_high_low = sse_to_linear(r_high_low, &transfer); + + let r_high_high = _mm_unpackhi_epi16(r_high, zeros); + + let x_high_high = sse_to_linear(r_high_high, &transfer); + + (x_low_low, x_low_high, x_high_low, x_high_high) +} + +#[inline(always)] +pub unsafe fn sse_plane_to_linear( + start_cx: usize, + src: *const u8, + src_offset: usize, + width: u32, + dst: *mut f32, + dst_offset: usize, + transfer_function: TransferFunction, +) -> usize { + let mut cx = start_cx; + let transfer = get_sse_linear_transfer(transfer_function); + + let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; + + while cx + 64 < width as usize { + let src_ptr = src.add(src_offset + cx); + let pixels_row64 = _mm_loadu_si128_x4(src_ptr); + let storing_row0 = process_pixels(pixels_row64.0, &transfer); + _mm_storeu_ps_x4(dst_ptr.add(cx), storing_row0); + + let storing_row1 = process_pixels(pixels_row64.1, &transfer); + _mm_storeu_ps_x4(dst_ptr.add(cx + 16), storing_row1); + + let storing_row2 = process_pixels(pixels_row64.2, &transfer); + _mm_storeu_ps_x4(dst_ptr.add(cx + 32), storing_row2); + + let storing_row3 = process_pixels(pixels_row64.3, &transfer); + _mm_storeu_ps_x4(dst_ptr.add(cx + 48), storing_row3); + + cx += 64; + } + + while cx + 16 < width as usize { + let src_ptr = src.add(src_offset + cx); + let pixels = _mm_loadu_si128(src_ptr as *const __m128i); + let storing_row = process_pixels(pixels, &transfer); + _mm_storeu_ps_x4(dst_ptr.add(cx), storing_row); + + cx += 16; + } + + cx +} diff --git a/src/sse/sigmoidal.rs b/src/sse/sigmoidal.rs index c1a13c7..f6916a7 100644 --- a/src/sse/sigmoidal.rs +++ b/src/sse/sigmoidal.rs @@ -1,8 +1,9 @@ -use crate::sse::{_mm_exp_ps, _mm_log_ps, _mm_neg_ps, _mm_select_ps}; +use crate::sse::{_mm_neg_ps, _mm_select_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; +use erydanos::{_mm_exp_ps, _mm_ln_fast_ps}; #[inline(always)] pub(crate) unsafe fn sse_color_to_sigmoidal(x: __m128) -> __m128 { @@ -21,7 +22,7 @@ pub(crate) unsafe fn sse_sigmoidal_to_color(x: __m128) -> __m128 { let k = _mm_mul_ps(x, _mm_rcp_ps(den)); let zeros = _mm_setzero_ps(); let zero_mask_2 = _mm_cmple_ps(k, zeros); - let ln = _mm_log_ps::(k); + let ln = _mm_ln_fast_ps(k); let rs = _mm_select_ps(_mm_and_ps(zero_mask_1, zero_mask_2), zeros, ln); return rs; } diff --git a/src/sse/support.rs b/src/sse/support.rs index 340ee75..b05b5ef 100644 --- a/src/sse/support.rs +++ b/src/sse/support.rs @@ -8,27 +8,7 @@ pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { ((z << 6) | (y << 4) | (x << 2) | w) as i32 } -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -#[allow(dead_code)] -pub unsafe fn sse_promote_i16_toi32(s: __m128i) -> __m128i { - _mm_cvtepi16_epi32(_mm_srli_si128::<8>(s)) -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] -#[allow(dead_code)] -pub unsafe fn sse_interleave_even(x: __m128i) -> __m128i { - #[rustfmt::skip] - let shuffle = _mm_setr_epi8(0, 0, 2, 2, 4, 4, 6, 6, - 8, 8, 10, 10, 12, 12, 14, 14); - let new_lane = _mm_shuffle_epi8(x, shuffle); - return new_lane; -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -#[allow(dead_code)] pub unsafe fn sse_interleave_rgba( r: __m128i, g: __m128i, @@ -355,17 +335,6 @@ pub unsafe fn sse_store_rgb_u8(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i) _mm_storeu_si128(ptr.add(32) as *mut __m128i, v2); } -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -#[allow(dead_code)] -pub unsafe fn sse_div_by255(v: __m128i) -> __m128i { - let rounding = _mm_set1_epi16(1 << 7); - let x = _mm_adds_epi16(v, rounding); - let multiplier = _mm_set1_epi16(-32640); - let r = _mm_mulhi_epu16(x, multiplier); - return _mm_srli_epi16::<7>(r); -} - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -385,3 +354,39 @@ pub unsafe fn sse_deinterleave_rgba_ps( let v3 = _mm_unpackhi_ps(t02hi, t13hi); (v0, v1, v2, v3) } + +#[inline(always)] +pub unsafe fn _mm_loadu_si128_x4(ptr: *const u8) -> (__m128i, __m128i, __m128i, __m128i) { + ( + _mm_loadu_si128(ptr as *const __m128i), + _mm_loadu_si128(ptr.add(16) as *const __m128i), + _mm_loadu_si128(ptr.add(32) as *const __m128i), + _mm_loadu_si128(ptr.add(48) as *const __m128i), + ) +} + +#[inline(always)] +pub unsafe fn _mm_storeu_ps_x4(ptr: *mut f32, set: (__m128, __m128, __m128, __m128)) { + _mm_storeu_ps(ptr, set.0); + _mm_storeu_ps(ptr.add(4), set.1); + _mm_storeu_ps(ptr.add(8), set.2); + _mm_storeu_ps(ptr.add(12), set.3); +} + +#[inline(always)] +pub unsafe fn _mm_loadu_ps_x4(ptr: *const f32) -> (__m128, __m128, __m128, __m128) { + ( + _mm_loadu_ps(ptr), + _mm_loadu_ps(ptr.add(4)), + _mm_loadu_ps(ptr.add(8)), + _mm_loadu_ps(ptr.add(12)), + ) +} + +#[inline(always)] +pub unsafe fn _mm_storeu_si128_x4(ptr: *mut u8, set: (__m128i, __m128i, __m128i, __m128i)) { + _mm_storeu_si128(ptr as * mut __m128i, set.0); + _mm_storeu_si128(ptr.add(16) as * mut __m128i, set.1); + _mm_storeu_si128(ptr.add(32) as * mut __m128i, set.2); + _mm_storeu_si128(ptr.add(48) as * mut __m128i, set.3); +} \ No newline at end of file