diff --git a/Cargo.lock b/Cargo.lock
index 1789089..9c24aae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -94,15 +94,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
 name = "bitstream-io"
-version = "2.3.0"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c12d1856e42f0d817a835fe55853957c85c8c8a470114029143d3f12671446e"
+checksum = "3dcde5f311c85b8ca30c2e4198d4326bc342c76541590106f5fa4a50946ea499"
 
 [[package]]
 name = "built"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c6a6c0b39c38fd754ac338b00a88066436389c0f029da5d37d1e01091d9b7c17"
+checksum = "236e6289eda5a812bc6b53c3b024039382a2895fbbeef2d748b2931546d392c4"
 
 [[package]]
 name = "bumpalo"
@@ -112,9 +112,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
 
 [[package]]
 name = "bytemuck"
-version = "1.16.0"
+version = "1.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
+checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
 
 [[package]]
 name = "byteorder"
@@ -130,9 +130,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
 
 [[package]]
 name = "cc"
-version = "1.0.98"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"
+checksum = "eaff6f8ce506b9773fa786672d63fc7a191ffea1be33f72bbd4aeacefca9ffc8"
 dependencies = [
  "jobserver",
  "libc",
@@ -163,7 +163,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
 
 [[package]]
 name = "colorutils-rs"
-version = "0.4.12"
+version = "0.4.13"
 dependencies = [
  "erydanos",
  "half",
@@ -211,9 +211,9 @@ checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
 
 [[package]]
 name = "either"
-version = "1.12.0"
+version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
+checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
 [[package]]
 name = "equivalent"
@@ -223,9 +223,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 
 [[package]]
 name = "erydanos"
-version = "0.1.0"
+version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b0354c3359e57ded8b5f8a120273cb1da304630399da59afa14182404573d6f"
+checksum = "1a140744bdb5b8777d9714a8d6a72c5e58d4eb2b0c3c8a85c8bada86efd9fa21"
 dependencies = [
  "num-traits",
 ]
@@ -436,9 +436,9 @@ dependencies = [
 
 [[package]]
 name = "log"
-version = "0.4.21"
+version = "0.4.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
 
 [[package]]
 name = "loop9"
@@ -461,9 +461,9 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.7.2"
+version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
 [[package]]
 name = "minimal-lexical"
@@ -473,9 +473,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 
 [[package]]
 name = "miniz_oxide"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
 dependencies = [
  "adler",
  "simd-adler32",
@@ -505,9 +505,9 @@ checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8"
 
 [[package]]
 name = "num-bigint"
-version = "0.4.5"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
 dependencies = [
  "num-integer",
  "num-traits",
@@ -592,9 +592,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.84"
+version = "1.0.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
 dependencies = [
  "unicode-ident",
 ]
@@ -709,9 +709,9 @@ dependencies = [
 
 [[package]]
 name = "ravif"
-version = "0.11.5"
+version = "0.11.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc13288f5ab39e6d7c9d501759712e6969fcc9734220846fc9ed26cae2cc4234"
+checksum = "c6ba61c28ba24c0cf8406e025cb29a742637e3f70776e61c27a8a8b72a042d12"
 dependencies = [
  "avif-serialize",
  "imgref",
@@ -744,9 +744,9 @@ dependencies = [
 
 [[package]]
 name = "rgb"
-version = "0.8.37"
+version = "0.8.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05aaa8004b64fd573fc9d002f4e632d51ad4f026c2b5ba95fcb6c2f32c2c47d8"
+checksum = "1aee83dc281d5a3200d37b299acd13b81066ea126a7f16f0eae70fc9aed241d9"
 dependencies = [
  "bytemuck",
 ]
@@ -759,18 +759,18 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
 [[package]]
 name = "serde"
-version = "1.0.203"
+version = "1.0.204"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
+checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.203"
+version = "1.0.204"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
+checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -818,9 +818,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.66"
+version = "2.0.70"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5"
+checksum = "2f0209b68b3613b093e0ec905354eccaedcfe83b8cb37cbdeae64026c3064c16"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -842,9 +842,9 @@ dependencies = [
 
 [[package]]
 name = "target-lexicon"
-version = "0.12.14"
+version = "0.12.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f"
+checksum = "4873307b7c257eddcb50c9bedf158eb669578359fb28428bef438fec8e6ba7c2"
 
 [[package]]
 name = "thiserror"
@@ -879,9 +879,9 @@ dependencies = [
 
 [[package]]
 name = "toml"
-version = "0.8.13"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4e43f8cc456c9704c851ae29c67e17ef65d2c30017c17a9765b89c382dc8bba"
+checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335"
 dependencies = [
  "serde",
  "serde_spanned",
@@ -900,9 +900,9 @@ dependencies = [
 
 [[package]]
 name = "toml_edit"
-version = "0.22.13"
+version = "0.22.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c127785850e8c20836d49732ae6abfa47616e60bf9d9f57c43c250361a9db96c"
+checksum = "d59a3a72298453f564e2b111fa896f8d07fabb36f51f06d7e875fc5e0b5a3ef1"
 dependencies = [
  "indexmap",
  "serde",
@@ -1002,9 +1002,9 @@ checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
 
 [[package]]
 name = "winnow"
-version = "0.6.9"
+version = "0.6.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86c949fede1d13936a99f14fafd3e76fd642b556dd2ce96287fbe2e0151bfac6"
+checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
 dependencies = [
  "memchr",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 35950dd..d0d17d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }
 
 [package]
 name = "colorutils-rs"
-version = "0.4.12"
+version = "0.4.13"
 edition = "2021"
 description = "High performance utilities for color format handling and conversion."
 readme = "README.md"
@@ -16,7 +16,7 @@ repository = "https://github.com/awxkee/colorutils-rs"
 exclude = ["*.jpg"]
 
 [dependencies]
-erydanos = "0.1.0"
+erydanos = "0.2.3"
 half = "2.4.1"
 
 [features]
diff --git a/src/app/src/main.rs b/src/app/src/main.rs
index 9bbdfae..5992c39 100644
--- a/src/app/src/main.rs
+++ b/src/app/src/main.rs
@@ -58,7 +58,7 @@ fn main() {
         lab_store.resize(width as usize * components * height as usize, 0f32);
         let src_stride = width * components as u32;
         let start_time = Instant::now();
-        rgb_to_lab(
+        rgb_to_lch(
             src_bytes,
             src_stride,
             &mut lab_store,
@@ -92,7 +92,7 @@ fn main() {
         // }
 
         let start_time = Instant::now();
-        lab_to_srgb(
+        lch_to_rgb(
             &lab_store,
             store_stride as u32,
             &mut dst_slice,
diff --git a/src/linear_to_planar.rs b/src/linear_to_planar.rs
index ba90d4e..2b5a9c8 100644
--- a/src/linear_to_planar.rs
+++ b/src/linear_to_planar.rs
@@ -3,6 +3,11 @@
     target_feature = "neon"
 ))]
 use crate::neon::linear_to_planar::neon_linear_plane_to_gamma;
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "sse4.1"
+))]
+use crate::sse::sse_linear_plane_to_gamma;
 use crate::TransferFunction;
 
 #[inline(always)]
@@ -20,6 +25,20 @@ fn linear_to_gamma_channels(
 
     let transfer = transfer_function.get_gamma_function();
 
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    let mut _has_sse = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    if is_x86_feature_detected!("sse4.1") {
+        _has_sse = true;
+    }
+
     for _ in 0..height as usize {
         let mut _cx = 0usize;
 
@@ -39,6 +58,24 @@ fn linear_to_gamma_channels(
             );
         }
 
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
+        unsafe {
+            if _has_sse {
+                _cx = sse_linear_plane_to_gamma(
+                    _cx,
+                    src.as_ptr(),
+                    src_offset as u32,
+                    dst.as_mut_ptr(),
+                    dst_offset as u32,
+                    width,
+                    transfer_function,
+                );
+            }
+        }
+
         let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *const f32 };
         let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) };
 
diff --git a/src/neon/cie.rs b/src/neon/cie.rs
index 4630546..32d5fe1 100644
--- a/src/neon/cie.rs
+++ b/src/neon/cie.rs
@@ -3,12 +3,8 @@ use crate::luv::{
     LUV_WHITE_V_PRIME,
 };
 use crate::neon::math::{prefer_vfmaq_f32, vcolorq_matrix_f32, vcubeq_f32};
-use erydanos::neon::atan2f::vatan2q_f32;
-use erydanos::neon::cbrtf::vcbrtq_f32;
-use erydanos::neon::cosf::vcosq_f32;
-use erydanos::neon::hypotf::vhypotq_fast_f32;
-use erydanos::neon::sinf::vsinq_f32;
 use std::arch::aarch64::*;
+use erydanos::{vatan2q_f32, vcbrtq_f32, vcosq_f32, vhypotq_fast_f32, vsinq_f32};
 
 #[inline(always)]
 pub(crate) unsafe fn neon_triple_to_xyz(
diff --git a/src/neon/math.rs b/src/neon/math.rs
index 2852e44..b1a7ad4 100644
--- a/src/neon/math.rs
+++ b/src/neon/math.rs
@@ -1,6 +1,5 @@
 use std::arch::aarch64::*;
-
-use erydanos::neon::powf::vpowq_fast_f32;
+use erydanos::vpowq_fast_f32;
 
 #[inline(always)]
 #[allow(dead_code)]
diff --git a/src/planar_to_linear.rs b/src/planar_to_linear.rs
index 7bf4672..380c0d8 100644
--- a/src/planar_to_linear.rs
+++ b/src/planar_to_linear.rs
@@ -3,6 +3,11 @@
     target_feature = "neon"
 ))]
 use crate::neon::planar_to_linear::neon_plane_to_linear;
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "sse4.1"
+))]
+use crate::sse::sse_plane_to_linear;
 use crate::TransferFunction;
 
 #[inline(always)]
@@ -18,6 +23,20 @@ fn channels_to_linear(
     let mut src_offset = 0usize;
     let mut dst_offset = 0usize;
 
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    let mut _has_sse = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    if is_x86_feature_detected!("sse4.1") {
+        _has_sse = true;
+    }
+
     let transfer = transfer_function.get_linearize_function();
     for _ in 0..height as usize {
         let mut _cx = 0usize;
@@ -25,6 +44,24 @@ fn channels_to_linear(
         let src_ptr = unsafe { src.as_ptr().add(src_offset) };
         let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 };
 
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
+        unsafe {
+            if _has_sse {
+                _cx = sse_plane_to_linear(
+                    _cx,
+                    src.as_ptr(),
+                    src_offset,
+                    width,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    transfer_function,
+                );
+            }
+        }
+
         #[cfg(all(
             any(target_arch = "aarch64", target_arch = "arm"),
             target_feature = "neon"
diff --git a/src/sse/cie.rs b/src/sse/cie.rs
index 1981d93..d4c8d93 100644
--- a/src/sse/cie.rs
+++ b/src/sse/cie.rs
@@ -3,13 +3,14 @@ use crate::luv::{
     LUV_WHITE_V_PRIME,
 };
 use crate::sse::{
-    _mm_atan2_ps, _mm_cbrt_ps, _mm_color_matrix_ps, _mm_cos_ps, _mm_cube_ps, _mm_hypot_ps,
-    _mm_prefer_fma_ps, _mm_select_ps, _mm_sin_ps,
+    _mm_color_matrix_ps, _mm_cube_ps,
+    _mm_prefer_fma_ps, _mm_select_ps,
 };
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
+use erydanos::{_mm_atan2_ps, _mm_cbrt_ps, _mm_cos_ps, _mm_hypot_ps, _mm_sin_ps};
 
 #[inline(always)]
 pub(crate) unsafe fn sse_triple_to_xyz(
diff --git a/src/sse/color.rs b/src/sse/color.rs
index 517f6e5..84be872 100644
--- a/src/sse/color.rs
+++ b/src/sse/color.rs
@@ -1,8 +1,9 @@
-use crate::sse::{_mm_abs_ps, _mm_fmod_ps, _mm_prefer_fma_ps, _mm_select_ps};
+use crate::sse::{_mm_prefer_fma_ps, _mm_select_ps};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
+use erydanos::{_mm_abs_ps, _mm_fmod_ps};
 
 #[inline(always)]
 pub unsafe fn sse_hsl_to_rgb(
diff --git a/src/sse/linear_to_planar.rs b/src/sse/linear_to_planar.rs
new file mode 100644
index 0000000..19eb088
--- /dev/null
+++ b/src/sse/linear_to_planar.rs
@@ -0,0 +1,81 @@
+use crate::sse::{_mm_loadu_ps_x4, _mm_storeu_si128_x4, get_sse_gamma_transfer};
+use crate::TransferFunction;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+/// Applies `transfer` to four linear samples, scales the result by 255 and
+/// converts it to `i32` lanes after rounding to nearest.
+#[inline(always)]
+unsafe fn transfer_to_gamma(r: __m128, transfer: &unsafe fn(__m128) -> __m128) -> __m128i {
+    // Round to nearest and suppress floating-point exceptions.
+    const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+    let scaled = _mm_mul_ps(transfer(r), _mm_set1_ps(255f32));
+    _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(scaled))
+}
+
+/// Converts four vectors of linear samples (16 values) into 16 packed bytes.
+///
+/// Every vector goes through `transfer_to_gamma`; the `i32` lanes are then
+/// narrowed to `u16` and finally `u8` with unsigned saturation, keeping order.
+#[inline(always)]
+unsafe fn process_set(
+    k: (__m128, __m128, __m128, __m128),
+    function: &unsafe fn(__m128) -> __m128,
+) -> __m128i {
+    let (v0, v1, v2, v3) = k;
+    // Narrow i32 -> u16 pairwise first, then u16 -> u8 across both halves.
+    let lo = _mm_packus_epi32(transfer_to_gamma(v0, function), transfer_to_gamma(v1, function));
+    let hi = _mm_packus_epi32(transfer_to_gamma(v2, function), transfer_to_gamma(v3, function));
+
+    _mm_packus_epi16(lo, hi)
+}
+
+/// Gamma-encodes one row of linear `f32` samples into `u8` using SSE4.1.
+///
+/// Processes 64 samples per iteration, then 16, starting at `start_cx`, and
+/// returns the index of the first sample it did not convert.
+///
+/// `src_offset` is applied in bytes and `dst_offset` in `u8` elements.
+/// NOTE(review): both loops use a strict `<`, so a chunk that ends exactly at
+/// `width` is not taken here — presumably left to the caller; confirm.
+///
+/// # Safety
+/// The CPU must support SSE4.1, and `src`/`dst` must be valid for `width`
+/// samples past their respective offsets.
+#[inline]
+pub unsafe fn sse_linear_plane_to_gamma(
+    start_cx: usize,
+    src: *const f32,
+    src_offset: u32,
+    dst: *mut u8,
+    dst_offset: u32,
+    width: u32,
+    transfer_function: TransferFunction,
+) -> usize {
+    let row_len = width as usize;
+    let function = get_sse_gamma_transfer(transfer_function);
+    // Row base pointers are loop-invariant; only the column index advances.
+    let src_row = (src as *const u8).add(src_offset as usize) as *const f32;
+    let dst_row = dst.add(dst_offset as usize);
+    let mut cx = start_cx;
+
+    while cx + 64 < row_len {
+        let src_ptr = src_row.add(cx);
+        let set0 = process_set(_mm_loadu_ps_x4(src_ptr), &function);
+        let set1 = process_set(_mm_loadu_ps_x4(src_ptr.add(16)), &function);
+        let set2 = process_set(_mm_loadu_ps_x4(src_ptr.add(32)), &function);
+        let set3 = process_set(_mm_loadu_ps_x4(src_ptr.add(48)), &function);
+        _mm_storeu_si128_x4(dst_row.add(cx), (set0, set1, set2, set3));
+        cx += 64;
+    }
+
+    while cx + 16 < row_len {
+        let r_row = process_set(_mm_loadu_ps_x4(src_row.add(cx)), &function);
+        _mm_storeu_si128(dst_row.add(cx) as *mut __m128i, r_row);
+        cx += 16;
+    }
+
+    cx
+}
diff --git a/src/sse/math.rs b/src/sse/math.rs
index a3defbc..94bfef1 100644
--- a/src/sse/math.rs
+++ b/src/sse/math.rs
@@ -3,6 +3,8 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
+use erydanos::_mm_pow_ps;
+
 #[inline(always)]
 pub unsafe fn _mm_cube_ps(x: __m128) -> __m128 {
     _mm_mul_ps(_mm_mul_ps(x, x), x)
@@ -42,40 +44,6 @@ unsafe fn _mm_taylorpoly_ps(
     return res;
 }
 
-#[inline(always)]
-pub unsafe fn _mm_log_ps<const HANDLE_NAN: bool>(v: __m128) -> __m128 {
-    let zeros = _mm_setzero_ps();
-    let nan_mask = _mm_cmple_ps(v, zeros);
-    let const_ln127 = _mm_set1_epi32(127); // 127
-    let const_ln2 = _mm_set1_ps(std::f32::consts::LN_2); // ln(2)
-
-    // Extract exponent
-    let m = _mm_sub_epi32(_mm_srli_epi32::<23>(_mm_castps_si128(v)), const_ln127);
-    let val = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(v), _mm_slli_epi32::<23>(m)));
-
-    let mut poly = _mm_taylorpoly_ps(
-        val,
-        _mm_set1_ps(-2.29561495781f32),
-        _mm_set1_ps(-2.47071170807f32),
-        _mm_set1_ps(-5.68692588806f32),
-        _mm_set1_ps(-0.165253549814f32),
-        _mm_set1_ps(5.17591238022f32),
-        _mm_set1_ps(0.844007015228f32),
-        _mm_set1_ps(4.58445882797f32),
-        _mm_set1_ps(0.0141278216615f32),
-    );
-
-    poly = _mm_prefer_fma_ps(poly, _mm_cvtepi32_ps(m), const_ln2);
-
-    if HANDLE_NAN {
-        poly = _mm_select_ps(nan_mask, _mm_set1_ps(-f32::INFINITY), poly);
-    } else {
-        poly = _mm_select_ps(nan_mask, zeros, poly);
-    }
-
-    poly
-}
-
 #[inline(always)]
 pub unsafe fn _mm_select_ps(mask: __m128, true_vals: __m128, false_vals: __m128) -> __m128 {
     _mm_blendv_ps(false_vals, true_vals, mask)
@@ -96,131 +64,27 @@ pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __
     )
 }
 
-#[inline(always)]
-pub unsafe fn _mm_exp_ps(x: __m128) -> __m128 {
-    _mm_exp_ps_ulp_1_5::<false>(x)
-}
-
-#[inline(always)]
-pub unsafe fn _mm_exp_ps_ulp_1_5<const HANDLE_NAN: bool>(x: __m128) -> __m128 {
-    let c1 = _mm_castsi128_ps(_mm_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
-    let c2 = _mm_castsi128_ps(_mm_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
-    let c3 = _mm_castsi128_ps(_mm_set1_epi32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f
-    let c4 = _mm_castsi128_ps(_mm_set1_epi32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f
-    let c5 = _mm_castsi128_ps(_mm_set1_epi32(0x3c072010)); // x^5: 0x1.0e4020p-7f
-
-    let shift = _mm_castsi128_ps(_mm_set1_epi32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
-    let inv_ln2 = _mm_castsi128_ps(_mm_set1_epi32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
-    let neg_ln2_hi = _mm_castsi128_ps(_mm_set1_epi32(-1087278592i32)); // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
-    let neg_ln2_lo = _mm_castsi128_ps(_mm_set1_epi32(-1245725042i32)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
-
-    // Range reduction:
-    //   e^x = 2^n * e^r
-    // where:
-    //   n = floor(x / ln(2))
-    //   r = x - n * ln(2)
-    //
-    // By adding x / ln(2) with 2^23 + 127 (shift):
-    //   * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 forces decimal part
-    //     of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy
-    //     the whole fraction part of z in FP32 format.
-    //     Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2)
-    //     (i.e. n) because the decimal part has been pushed out and lost.
-    //   * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
-    //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
-    let z = _mm_prefer_fma_ps(shift, x, inv_ln2);
-    let n = _mm_sub_ps(z, shift);
-    let scale = _mm_castsi128_ps(_mm_slli_epi32::<23>(_mm_castps_si128(z))); // 2^n
-
-    // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
-    // This outperforms longer Taylor series (3-4 tabs) both in terms of accuracy and performance.
-    let r_hi = _mm_prefer_fma_ps(x, n, neg_ln2_hi);
-    let r = _mm_prefer_fma_ps(r_hi, n, neg_ln2_lo);
-
-    // Compute the truncated Taylor series of e^r.
-    //   poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
-    let r2 = _mm_mul_ps(r, r);
-
-    let p1 = _mm_mul_ps(c1, r);
-    let p23 = _mm_prefer_fma_ps(c2, c3, r);
-    let p45 = _mm_prefer_fma_ps(c4, c5, r);
-    let p2345 = _mm_prefer_fma_ps(p23, p45, r2);
-    let p12345 = _mm_prefer_fma_ps(p1, p2345, r2);
-
-    let mut poly = _mm_prefer_fma_ps(scale, p12345, scale);
-
-    if HANDLE_NAN {
-        let inf = _mm_set1_ps(f32::INFINITY);
-        let max_input = _mm_set1_ps(88.37f32); // Approximately ln(2^127.5)
-        let zero = _mm_set1_ps(0f32);
-        let min_input = _mm_set1_ps(-86.64f32); // Approximately ln(2^-125)
-                                                // Handle underflow and overflow.
-        poly = _mm_select_ps(_mm_cmplt_ps(x, min_input), zero, poly);
-        poly = _mm_select_ps(_mm_cmpgt_ps(x, max_input), inf, poly);
-    }
-
-    return poly;
-}
-
-#[inline(always)]
-unsafe fn _mm_exp_ps_ulp_5_impl<const PROCESS_NAN: bool>(x: __m128) -> __m128 {
-    let l2e = _mm_set1_ps(std::f32::consts::LOG2_E); /* log2(e) */
-    let c0 = _mm_set1_ps(0.3371894346f32);
-    let c1 = _mm_set1_ps(0.657636276f32);
-    let c2 = _mm_set1_ps(1.00172476f32);
-
-    /* exp(x) = 2^i * 2^f; i = floor (log2(e) * x), 0 <= f <= 1 */
-    let t = _mm_mul_ps(x, l2e); /* t = log2(e) * x */
-    let e = _mm_floor_ps(t); /* floor(t) */
-    let i = _mm_cvtps_epi32(e); /* (int)floor(t) */
-    let f = _mm_sub_ps(t, e); /* f = t - floor(t) */
-    let mut p = c0; /* c0 */
-    p = _mm_prefer_fma_ps(c1, p, f); /* c0 * f + c1 */
-    p = _mm_prefer_fma_ps(c2, p, f); /* p = (c0 * f + c1) * f + c2 ~= 2^f */
-    let j = _mm_slli_epi32::<23>(i); /* i << 23 */
-    let r = _mm_castsi128_ps(_mm_add_epi32(j, _mm_castps_si128(p))); /* r = p * 2^i*/
-    if PROCESS_NAN {
-        let inf = _mm_set1_ps(f32::INFINITY);
-        let max_input = _mm_set1_ps(88.72283f32); // Approximately ln(2^127.5)
-        let min_input = _mm_set1_ps(-87.33654f32); // Approximately ln(2^-125)
-        let poly = _mm_select_ps(_mm_cmplt_ps(x, min_input), _mm_setzero_ps(), r);
-        let poly = _mm_select_ps(_mm_cmpgt_ps(x, max_input), inf, poly);
-        return poly;
-    } else {
-        return r;
-    }
-}
-
-#[inline(always)]
-pub unsafe fn _mm_pow_ps(x: __m128, n: __m128) -> __m128 {
-    _mm_exp_ps(_mm_mul_ps(n, _mm_log_ps::<false>(x)))
-}
-
 #[inline(always)]
 pub unsafe fn _mm_pow_n_ps(x: __m128, n: f32) -> __m128 {
-    _mm_exp_ps(_mm_mul_ps(_mm_set1_ps(n), _mm_log_ps::<false>(x)))
+    _mm_pow_ps(x, _mm_set1_ps(n))
 }
 
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_signbit_ps(f: __m128) -> __m128i {
     return _mm_and_si128(_mm_castps_si128(f), _mm_castps_si128(_mm_set1_ps(-0.0f32)));
 }
 
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_mulsign_ps(x: __m128, y: __m128) -> __m128 {
     return _mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(x), _mm_signbit_ps(y)));
 }
 
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_pow2i_ps(q: __m128i) -> __m128 {
     return _mm_castsi128_ps(_mm_slli_epi32::<23>(_mm_add_epi32(q, _mm_set1_epi32(0x7f))));
 }
 
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_vldexp2_ps(d: __m128, e: __m128i) -> __m128 {
     return _mm_mul_ps(
         _mm_mul_ps(d, _mm_pow2i_ps(_mm_srli_epi32::<1>(e))),
@@ -229,7 +93,6 @@ pub unsafe fn _mm_vldexp2_ps(d: __m128, e: __m128i) -> __m128 {
 }
 
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_vilogbk_ps(d: __m128) -> __m128i {
     let o = _mm_cmplt_ps(d, _mm_set1_ps(5.421010862427522E-20f32));
     let d = _mm_select_ps(o, _mm_mul_ps(_mm_set1_ps(1.8446744073709552E19f32), d), d);
@@ -253,13 +116,6 @@ pub(crate) unsafe fn _mm_fmaf_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
     _mm_prefer_fma_ps(c, b, a)
 }
 
-#[inline(always)]
-#[allow(dead_code)]
-pub(crate) unsafe fn _mm_abs_ps(x: __m128) -> __m128 {
-    let sign_mask = _mm_set1_ps(-0f32);
-    return _mm_andnot_ps(sign_mask, x);
-}
-
 #[inline(always)]
 #[allow(dead_code)]
 pub(crate) unsafe fn _mm_neg_epi32(x: __m128i) -> __m128i {
@@ -273,64 +129,6 @@ pub(crate) unsafe fn _mm_neg_ps(x: __m128) -> __m128 {
     return _mm_sub_ps(high, x);
 }
 
-#[inline(always)]
-/// This is Cube Root using Pow functions,
-/// it is also precise however due to of inexact nature of power 1/3 result slightly differ
-/// from real cbrt with about ULP 3-4, but this is almost 2 times faster than cbrt with real ULP 3.5
-pub unsafe fn _mm_cbrt_ps(d: __m128) -> __m128 {
-    _mm_cbrt_ulp2_ps::<false>(d)
-}
-
-#[inline(always)]
-#[allow(dead_code)]
-/// Precise version of Cube Root, ULP 3.5
-pub unsafe fn _mm_cbrt_ps_ulp35(d: __m128) -> __m128 {
-    let mut q = _mm_set1_ps(1f32);
-    let e = _mm_add_epi32(_mm_vilogbk_ps(_mm_abs_ps(d)), _mm_set1_epi32(1));
-    let mut d = _mm_vldexp2_ps(d, _mm_neg_epi32(e));
-
-    let t = _mm_add_ps(_mm_cvtepi32_ps(e), _mm_set1_ps(6144f32));
-    let qu = _mm_cvttps_epi32(_mm_mul_ps(t, _mm_set1_ps(1.0f32 / 3.0f32)));
-    let re = _mm_cvttps_epi32(_mm_sub_ps(
-        t,
-        _mm_mul_ps(_mm_cvtepi32_ps(qu), _mm_set1_ps(3f32)),
-    ));
-
-    q = _mm_selecti_ps(
-        _mm_cmpeq_epi32(re, _mm_set1_epi32(1)),
-        _mm_set1_ps(1.2599210498948731647672106f32),
-        q,
-    );
-    q = _mm_selecti_ps(
-        _mm_cmpeq_epi32(re, _mm_set1_epi32(2)),
-        _mm_set1_ps(1.5874010519681994747517056f32),
-        q,
-    );
-    q = _mm_vldexp2_ps(q, _mm_sub_epi32(qu, _mm_set1_epi32(2048)));
-    q = _mm_mulsign_ps(q, d);
-    d = _mm_abs_ps(d);
-
-    let mut x = _mm_set1_ps(-0.601564466953277587890625f32);
-    x = _mm_fmaf_ps(x, d, _mm_set1_ps(2.8208892345428466796875f32));
-    x = _mm_fmaf_ps(x, d, _mm_set1_ps(-5.532182216644287109375f32));
-    x = _mm_fmaf_ps(x, d, _mm_set1_ps(5.898262500762939453125f32));
-    x = _mm_fmaf_ps(x, d, _mm_set1_ps(-3.8095417022705078125f32));
-    x = _mm_fmaf_ps(x, d, _mm_set1_ps(2.2241256237030029296875f32));
-
-    let mut y = _mm_mul_ps(_mm_mul_ps(d, x), x);
-    y = _mm_mul_ps(
-        _mm_sub_ps(
-            y,
-            _mm_mul_ps(
-                _mm_mul_ps(y, _mm_set1_ps(2.0f32 / 3.0f32)),
-                _mm_fmaf_ps(y, x, _mm_set1_ps(-1.0f32)),
-            ),
-        ),
-        q,
-    );
-    return y;
-}
-
 #[inline(always)]
 pub unsafe fn _mm_cmpge_epi32(a: __m128i, b: __m128i) -> __m128i {
     let gt = _mm_cmpgt_epi32(a, b);
@@ -344,58 +142,6 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
 }
 
 #[inline(always)]
-/// Precise version of Cube Root with ULP 2
-pub unsafe fn _mm_cbrt_ulp2_ps<const HANDLE_NAN: bool>(x: __m128) -> __m128 {
-    let x1p24 = _mm_castsi128_ps(_mm_set1_epi32(0x4b800000)); // 0x1p24f === 2 ^ 24
-
-    let mut ui = _mm_cvtps_epi32(x);
-    let hx = _mm_and_si128(ui, _mm_set1_epi32(0x7fffffff));
-
-    let nan_mask = _mm_cmpge_epi32(hx, _mm_set1_epi32(0x7f800000));
-    let is_zero_mask = _mm_cmpeq_epi32(hx, _mm_setzero_si128());
-
-    let lo_mask = _mm_cmplt_epi32(hx, _mm_set1_epi32(0x00800000));
-    let hi_ui_f = _mm_castps_si128(_mm_mul_ps(x, x1p24));
-    let mut lo_hx = _mm_and_si128(hi_ui_f, _mm_set1_epi32(0x7fffffff));
-    let recpeq_3 = _mm_set1_ps(1f32 / 3f32);
-    lo_hx = _mm_add_epi32(
-        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(lo_hx), recpeq_3)),
-        _mm_set1_epi32(642849266),
-    );
-    let hi_hx = _mm_add_epi32(
-        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(hx), recpeq_3)),
-        _mm_set1_epi32(709958130),
-    );
-    let hx = _mm_select_si128(lo_mask, lo_hx, hi_hx);
-
-    ui = _mm_select_si128(lo_mask, hi_ui_f, ui);
-    ui = _mm_and_si128(ui, _mm_set1_epi32(-2147483648i32));
-    ui = _mm_or_si128(ui, hx);
-
-    let mut t = _mm_castsi128_ps(ui);
-    let mut r = _mm_mul_ps(_mm_mul_ps(t, t), t);
-
-    let sum_x = _mm_add_ps(x, x);
-
-    t = _mm_mul_ps(
-        _mm_div_ps(_mm_add_ps(sum_x, r), _mm_add_ps(_mm_add_ps(r, r), x)),
-        t,
-    );
-
-    r = _mm_mul_ps(_mm_mul_ps(t, t), t);
-    t = _mm_mul_ps(
-        _mm_div_ps(_mm_add_ps(sum_x, r), _mm_add_ps(_mm_add_ps(r, r), x)),
-        t,
-    );
-    if HANDLE_NAN {
-        t = _mm_selecti_ps(nan_mask, _mm_set1_ps(f32::NAN), t);
-        t = _mm_selecti_ps(is_zero_mask, _mm_setzero_ps(), t);
-    }
-    t
-}
-
-#[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_color_matrix_ps(
     r: __m128,
     g: __m128,
@@ -416,70 +162,6 @@ pub unsafe fn _mm_color_matrix_ps(
     (new_r, new_g, new_b)
 }
 
-#[inline(always)]
-#[allow(dead_code)]
-pub(crate) unsafe fn _mm_fmod_ps(a: __m128, b: __m128) -> __m128 {
-    let dividend_vec = a;
-    let divisor_vec = b;
-    let division = _mm_mul_ps(dividend_vec, _mm_rcp_ps(divisor_vec)); // Perform division
-    let int_part = _mm_floor_ps(division); // Get the integer part using floor
-    let product = _mm_mul_ps(int_part, divisor_vec); // Multiply the integer part by the divisor
-    let remainder = _mm_sub_ps(dividend_vec, product); // Subtract the product from the dividend
-    remainder
-}
-
-#[inline(always)]
-#[allow(dead_code)]
-pub unsafe fn _mm_is_infinity(d: __m128) -> __m128 {
-    return _mm_cmpeq_ps(_mm_abs_ps(d), _mm_set1_ps(f32::INFINITY));
-}
-
-#[inline(always)]
-#[allow(dead_code)]
-pub unsafe fn _mm_cos_ps(d: __m128) -> __m128 {
-    let mut q = _mm_cvtps_epi32(_mm_sub_ps(
-        _mm_mul_ps(d, _mm_set1_ps(std::f32::consts::FRAC_1_PI)),
-        _mm_set1_ps(0.5f32),
-    ));
-
-    q = _mm_add_epi32(_mm_add_epi32(q, q), _mm_set1_epi32(1));
-
-    let mut u = _mm_cvtepi32_ps(q);
-    let mut d = _mm_fmaf_ps(u, _mm_set1_ps(-0.78515625f32 * 2f32), d);
-    d = _mm_fmaf_ps(u, _mm_set1_ps(-0.00024187564849853515625f32 * 2f32), d);
-    d = _mm_fmaf_ps(u, _mm_set1_ps(-3.7747668102383613586e-08f32 * 2f32), d);
-    d = _mm_fmaf_ps(u, _mm_set1_ps(-1.2816720341285448015e-12f32 * 2f32), d);
-
-    let s = _mm_mul_ps(d, d);
-
-    d = _mm_castsi128_ps(_mm_xor_si128(
-        _mm_and_si128(
-            _mm_cmpeq_epi32(_mm_and_si128(q, _mm_set1_epi32(2)), _mm_set1_epi32(0)),
-            _mm_castps_si128(_mm_set1_ps(-0.0f32)),
-        ),
-        _mm_castps_si128(d),
-    ));
-
-    u = _mm_set1_ps(2.6083159809786593541503e-06f32);
-    u = _mm_fmaf_ps(u, s, _mm_set1_ps(-0.0001981069071916863322258f32));
-    u = _mm_fmaf_ps(u, s, _mm_set1_ps(0.00833307858556509017944336f32));
-    u = _mm_fmaf_ps(u, s, _mm_set1_ps(-0.166666597127914428710938f32));
-
-    u = _mm_fmaf_ps(s, _mm_mul_ps(u, d), d);
-
-    u = _mm_or_ps(_mm_is_infinity(d), u);
-
-    return u;
-}
-
-#[inline(always)]
-pub unsafe fn _mm_hypot_ps(x: __m128, y: __m128) -> __m128 {
-    let xp2 = _mm_mul_ps(x, x);
-    let yp2 = _mm_mul_ps(y, y);
-    let z = _mm_add_ps(xp2, yp2);
-    return _mm_sqrt_ps(z);
-}
-
 #[inline(always)]
 pub unsafe fn _mm_poly4_ps(
     x: __m128,
@@ -512,106 +194,3 @@ pub unsafe fn _mm_poly8q_ps(
         _mm_poly4_ps(x, x2, c3, c2, c1, c0),
     )
 }
-
-#[inline(always)]
-unsafe fn _mm_atan2q_ps_impl(y: __m128, x: __m128) -> __m128 {
-    let q = _mm_select_si128(
-        _mm_castps_si128(_mm_cmplt_ps(x, _mm_setzero_ps())),
-        _mm_set1_epi32(-2),
-        _mm_set1_epi32(0),
-    );
-    let x = _mm_abs_ps(x);
-    let is_y_more_than_x = _mm_cmpgt_ps(y, x);
-    let t = _mm_select_ps(is_y_more_than_x, x, _mm_setzero_ps());
-    let x = _mm_select_ps(is_y_more_than_x, y, x);
-    let y = _mm_select_ps(is_y_more_than_x, _mm_neg_ps(t), y);
-    let q = _mm_select_si128(
-        _mm_castps_si128(is_y_more_than_x),
-        _mm_add_epi32(q, _mm_set1_epi32(1)),
-        q,
-    );
-    let s = _mm_div_ps(y, x);
-    let t = _mm_mul_ps(s, s);
-    let t2 = _mm_mul_ps(t, t);
-    let t4 = _mm_mul_ps(t2, t2);
-    let poly = _mm_poly8q_ps(
-        t,
-        t2,
-        t4,
-        _mm_set1_ps(0.00282363896258175373077393f32),
-        _mm_set1_ps(-0.0159569028764963150024414f32),
-        _mm_set1_ps(0.0425049886107444763183594f32),
-        _mm_set1_ps(-0.0748900920152664184570312f32),
-        _mm_set1_ps(0.106347933411598205566406f32),
-        _mm_set1_ps(-0.142027363181114196777344f32),
-        _mm_set1_ps(0.199926957488059997558594f32),
-        _mm_set1_ps(-0.333331018686294555664062f32),
-    );
-    let t = _mm_prefer_fma_ps(s, _mm_mul_ps(poly, t), s);
-    let t = _mm_prefer_fma_ps(
-        t,
-        _mm_cvtepi32_ps(q),
-        _mm_set1_ps(std::f32::consts::FRAC_PI_2),
-    );
-    t
-}
-
-#[inline(always)]
-pub unsafe fn _mm_atan2_ps(y: __m128, x: __m128) -> __m128 {
-    let r = _mm_atan2q_ps_impl(_mm_abs_ps(y), x);
-    let mut r = _mm_mulsign_ps(r, x);
-    let zeros = _mm_setzero_ps();
-    let y_zero_mask = _mm_cmpeq_ps(y, zeros);
-    r = _mm_select_ps(
-        _mm_cmpeq_ps(x, zeros),
-        _mm_set1_ps(std::f32::consts::FRAC_PI_2),
-        r,
-    );
-    r = _mm_select_ps(y_zero_mask, zeros, r);
-    _mm_mulsign_ps(r, y)
-}
-
-#[inline(always)]
-pub unsafe fn _mm_sin_ps(val: __m128) -> __m128 {
-    let pi_v = _mm_set1_ps(std::f32::consts::PI);
-    let pio2_v = _mm_set1_ps(std::f32::consts::FRAC_PI_2);
-    let ipi_v = _mm_set1_ps(std::f32::consts::FRAC_1_PI);
-
-    //Find positive or negative
-    let c_v = _mm_abs_epi32(_mm_cvtps_epi32(_mm_mul_ps(val, ipi_v)));
-    let sign_v = _mm_castps_si128(_mm_cmple_ps(val, _mm_setzero_ps()));
-    let odd_v = _mm_and_si128(c_v, _mm_set1_epi32(1));
-
-    let neg_v = _mm_xor_si128(odd_v, sign_v);
-
-    //Modulus a - (n * int(a*(1/n)))
-    let mut ma = _mm_sub_ps(_mm_abs_ps(val), _mm_mul_ps(pi_v, _mm_cvtepi32_ps(c_v)));
-    let reb_v = _mm_cmpge_ps(ma, pio2_v);
-
-    //Rebase a between 0 and pi/2
-    ma = _mm_select_ps(reb_v, _mm_sub_ps(pi_v, ma), ma);
-
-    //Taylor series
-    let ma2 = _mm_mul_ps(ma, ma);
-
-    //2nd elem: x^3 / 3!
-    let mut elem = _mm_mul_ps(_mm_mul_ps(ma, ma2), _mm_set1_ps(0.166666666666f32));
-    let mut res = _mm_sub_ps(ma, elem);
-
-    //3rd elem: x^5 / 5!
-    elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.05f32));
-    res = _mm_add_ps(res, elem);
-
-    //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val)
-    elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.023809523810f32));
-    res = _mm_sub_ps(res, elem);
-
-    //5th elem: x^9 / 9!
-    elem = _mm_mul_ps(_mm_mul_ps(elem, ma2), _mm_set1_ps(0.013888888889f32));
-    res = _mm_add_ps(res, elem);
-
-    //Change of sign
-    let neg_v = _mm_slli_epi32::<31>(neg_v);
-    res = _mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(res), neg_v));
-    return res;
-}
diff --git a/src/sse/mod.rs b/src/sse/mod.rs
index 024129b..4803b21 100644
--- a/src/sse/mod.rs
+++ b/src/sse/mod.rs
@@ -36,6 +36,8 @@ mod from_sigmoidal;
 mod sigmoidal;
 mod to_sigmoidal;
 mod xyza_laba_to_image;
+mod planar_to_linear;
+mod linear_to_planar;
 
 pub use from_sigmoidal::sse_from_sigmoidal_row;
 pub use gamma_curves::*;
@@ -51,3 +53,5 @@ pub use to_xyz_lab::*;
 pub use to_xyza_laba::*;
 pub use xyz_lab_to_image::*;
 pub use xyza_laba_to_image::*;
+pub use planar_to_linear::sse_plane_to_linear;
+pub use linear_to_planar::sse_linear_plane_to_gamma;
\ No newline at end of file
diff --git a/src/sse/planar_to_linear.rs b/src/sse/planar_to_linear.rs
new file mode 100644
index 0000000..4eba72d
--- /dev/null
+++ b/src/sse/planar_to_linear.rs
@@ -0,0 +1,104 @@
+use crate::sse::{_mm_loadu_si128_x4, _mm_storeu_ps_x4, get_sse_linear_transfer};
+use crate::TransferFunction;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+/// Converts the four 32-bit integer lanes of `r` to `f32`, scales them by
+/// `1 / 255` and returns the result of calling `transfer` on that vector.
+#[inline(always)]
+unsafe fn sse_to_linear(r: __m128i, transfer: &unsafe fn(__m128) -> __m128) -> __m128 {
+    let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), _mm_set1_ps(1f32 / 255f32));
+    transfer(r_f)
+}
+
+/// Zero-extends the 16 bytes of `pixels` to 32-bit lanes and passes each group
+/// of four through [`sse_to_linear`].
+///
+/// The tuple preserves byte order: element 0 comes from bytes 0..4, element 1
+/// from bytes 4..8, element 2 from bytes 8..12 and element 3 from bytes 12..16.
+#[inline]
+unsafe fn process_pixels(
+    pixels: __m128i,
+    transfer: &unsafe fn(__m128) -> __m128,
+) -> (__m128, __m128, __m128, __m128) {
+    let zeros = _mm_setzero_si128();
+
+    // Interleaving with zeros widens u8 -> u16 -> u32 without sign extension.
+    let r_low = _mm_unpacklo_epi8(pixels, zeros);
+    let x_low_low = sse_to_linear(_mm_unpacklo_epi16(r_low, zeros), transfer);
+    let x_low_high = sse_to_linear(_mm_unpackhi_epi16(r_low, zeros), transfer);
+
+    let r_high = _mm_unpackhi_epi8(pixels, zeros);
+    let x_high_low = sse_to_linear(_mm_unpacklo_epi16(r_high, zeros), transfer);
+    let x_high_high = sse_to_linear(_mm_unpackhi_epi16(r_high, zeros), transfer);
+
+    (x_low_low, x_low_high, x_high_low, x_high_high)
+}
+
+/// Converts 8-bit samples to `f32` through the transfer routine selected by
+/// `transfer_function`, 64 and then 16 samples per iteration.
+///
+/// Processing starts at element `start_cx` and stops once fewer than 16 elements
+/// remain before `width`. The returned index is the first element that was not
+/// converted; the tail from there up to `width` is left untouched.
+///
+/// `src_offset` is added to `src` in bytes. `dst_offset` is also applied in bytes,
+/// before `dst` is reinterpreted as `*mut f32`; the column index is then applied
+/// in `f32` elements.
+///
+/// # Safety
+///
+/// `src` offset by `src_offset` must be readable for `width` bytes, and `dst`
+/// offset by `dst_offset` bytes must be writable for `width` `f32` values. The
+/// CPU must support the instructions used here and by the routine returned from
+/// `get_sse_linear_transfer`.
+#[inline(always)]
+pub unsafe fn sse_plane_to_linear(
+    start_cx: usize,
+    src: *const u8,
+    src_offset: usize,
+    width: u32,
+    dst: *mut f32,
+    dst_offset: usize,
+    transfer_function: TransferFunction,
+) -> usize {
+    let mut cx = start_cx;
+    let width = width as usize;
+    let transfer = get_sse_linear_transfer(transfer_function);
+
+    let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32;
+
+    // `<=` rather than `<`: a block covers elements `cx..cx + 64`, so it is in
+    // bounds whenever `cx + 64 <= width`, including an exactly full final block.
+    while cx + 64 <= width {
+        let src_ptr = src.add(src_offset + cx);
+        let pixels_row64 = _mm_loadu_si128_x4(src_ptr);
+
+        let storing_row0 = process_pixels(pixels_row64.0, &transfer);
+        _mm_storeu_ps_x4(dst_ptr.add(cx), storing_row0);
+
+        let storing_row1 = process_pixels(pixels_row64.1, &transfer);
+        _mm_storeu_ps_x4(dst_ptr.add(cx + 16), storing_row1);
+
+        let storing_row2 = process_pixels(pixels_row64.2, &transfer);
+        _mm_storeu_ps_x4(dst_ptr.add(cx + 32), storing_row2);
+
+        let storing_row3 = process_pixels(pixels_row64.3, &transfer);
+        _mm_storeu_ps_x4(dst_ptr.add(cx + 48), storing_row3);
+
+        cx += 64;
+    }
+
+    while cx + 16 <= width {
+        let src_ptr = src.add(src_offset + cx);
+        let pixels = _mm_loadu_si128(src_ptr as *const __m128i);
+        let storing_row = process_pixels(pixels, &transfer);
+        _mm_storeu_ps_x4(dst_ptr.add(cx), storing_row);
+
+        cx += 16;
+    }
+
+    cx
+}
diff --git a/src/sse/sigmoidal.rs b/src/sse/sigmoidal.rs
index c1a13c7..f6916a7 100644
--- a/src/sse/sigmoidal.rs
+++ b/src/sse/sigmoidal.rs
@@ -1,8 +1,9 @@
-use crate::sse::{_mm_exp_ps, _mm_log_ps, _mm_neg_ps, _mm_select_ps};
+use crate::sse::{_mm_neg_ps, _mm_select_ps};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
+use erydanos::{_mm_exp_ps, _mm_ln_fast_ps};
 
 #[inline(always)]
 pub(crate) unsafe fn sse_color_to_sigmoidal(x: __m128) -> __m128 {
@@ -21,7 +22,7 @@ pub(crate) unsafe fn sse_sigmoidal_to_color(x: __m128) -> __m128 {
     let k = _mm_mul_ps(x, _mm_rcp_ps(den));
     let zeros = _mm_setzero_ps();
     let zero_mask_2 = _mm_cmple_ps(k, zeros);
-    let ln = _mm_log_ps::<false>(k);
+    let ln = _mm_ln_fast_ps(k);
     let rs = _mm_select_ps(_mm_and_ps(zero_mask_1, zero_mask_2), zeros, ln);
     return rs;
 }
diff --git a/src/sse/support.rs b/src/sse/support.rs
index 340ee75..b05b5ef 100644
--- a/src/sse/support.rs
+++ b/src/sse/support.rs
@@ -8,27 +8,7 @@ pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
     ((z << 6) | (y << 4) | (x << 2) | w) as i32
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-#[inline(always)]
-#[allow(dead_code)]
-pub unsafe fn sse_promote_i16_toi32(s: __m128i) -> __m128i {
-    _mm_cvtepi16_epi32(_mm_srli_si128::<8>(s))
-}
-
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
-pub unsafe fn sse_interleave_even(x: __m128i) -> __m128i {
-    #[rustfmt::skip]
-        let shuffle = _mm_setr_epi8(0, 0, 2, 2, 4, 4, 6, 6,
-                                    8, 8, 10, 10, 12, 12, 14, 14);
-    let new_lane = _mm_shuffle_epi8(x, shuffle);
-    return new_lane;
-}
-
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-#[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn sse_interleave_rgba(
     r: __m128i,
     g: __m128i,
@@ -355,17 +335,6 @@ pub unsafe fn sse_store_rgb_u8(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i)
     _mm_storeu_si128(ptr.add(32) as *mut __m128i, v2);
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-#[inline(always)]
-#[allow(dead_code)]
-pub unsafe fn sse_div_by255(v: __m128i) -> __m128i {
-    let rounding = _mm_set1_epi16(1 << 7);
-    let x = _mm_adds_epi16(v, rounding);
-    let multiplier = _mm_set1_epi16(-32640);
-    let r = _mm_mulhi_epu16(x, multiplier);
-    return _mm_srli_epi16::<7>(r);
-}
-
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
@@ -385,3 +354,60 @@ pub unsafe fn sse_deinterleave_rgba_ps(
     let v3 = _mm_unpackhi_ps(t02hi, t13hi);
     (v0, v1, v2, v3)
 }
+
+/// Loads 64 consecutive bytes starting at `ptr` as four 128-bit integer vectors.
+///
+/// # Safety
+///
+/// `ptr` must be valid for reads of 64 bytes; the unaligned load intrinsic is used,
+/// so no particular alignment is required.
+#[inline(always)]
+pub unsafe fn _mm_loadu_si128_x4(ptr: *const u8) -> (__m128i, __m128i, __m128i, __m128i) {
+    let v0 = _mm_loadu_si128(ptr as *const __m128i);
+    let v1 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
+    let v2 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
+    let v3 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
+    (v0, v1, v2, v3)
+}
+
+/// Writes the four vectors of `set`, in order, to the 16 `f32` slots starting at `ptr`.
+///
+/// # Safety
+///
+/// `ptr` must be valid for writes of 16 `f32` values; alignment is not required.
+#[inline(always)]
+pub unsafe fn _mm_storeu_ps_x4(ptr: *mut f32, set: (__m128, __m128, __m128, __m128)) {
+    let (v0, v1, v2, v3) = set;
+    _mm_storeu_ps(ptr, v0);
+    _mm_storeu_ps(ptr.add(4), v1);
+    _mm_storeu_ps(ptr.add(8), v2);
+    _mm_storeu_ps(ptr.add(12), v3);
+}
+
+/// Loads 16 consecutive `f32` values starting at `ptr` as four 128-bit float vectors.
+///
+/// # Safety
+///
+/// `ptr` must be valid for reads of 16 `f32` values; alignment is not required.
+#[inline(always)]
+pub unsafe fn _mm_loadu_ps_x4(ptr: *const f32) -> (__m128, __m128, __m128, __m128) {
+    let v0 = _mm_loadu_ps(ptr);
+    let v1 = _mm_loadu_ps(ptr.add(4));
+    let v2 = _mm_loadu_ps(ptr.add(8));
+    let v3 = _mm_loadu_ps(ptr.add(12));
+    (v0, v1, v2, v3)
+}
+
+/// Writes the four vectors of `set`, in order, to the 64 bytes starting at `ptr`.
+///
+/// # Safety
+///
+/// `ptr` must be valid for writes of 64 bytes; alignment is not required.
+#[inline(always)]
+pub unsafe fn _mm_storeu_si128_x4(ptr: *mut u8, set: (__m128i, __m128i, __m128i, __m128i)) {
+    let (v0, v1, v2, v3) = set;
+    _mm_storeu_si128(ptr as *mut __m128i, v0);
+    _mm_storeu_si128(ptr.add(16) as *mut __m128i, v1);
+    _mm_storeu_si128(ptr.add(32) as *mut __m128i, v2);
+    _mm_storeu_si128(ptr.add(48) as *mut __m128i, v3);
+}
\ No newline at end of file